[
    {
        "id": "authors:m8ygm-cy673",
        "collection": "authors",
        "collection_id": "m8ygm-cy673",
        "cite_using_url": "https://resolver.caltech.edu/CaltechAUTHORS:20230316-204015123",
        "type": "monograph",
        "title": "A Finite-Sample Analysis of Payoff-Based Independent Learning in Zero-Sum Stochastic Games",
        "author": [
            {
                "family_name": "Chen",
                "given_name": "Zaiwei",
                "orcid": "0000-0001-9915-5595",
                "clpid": "Chen-Zaiwei"
            },
            {
                "family_name": "Zhang",
                "given_name": "Kaiqing",
                "clpid": "Zhang-Kaiqing"
            },
            {
                "family_name": "Mazumdar",
                "given_name": "Eric",
                "orcid": "0000-0002-1815-269X",
                "clpid": "Mazumdar-Eric"
            },
            {
                "family_name": "Ozdaglar",
                "given_name": "Asuman",
                "orcid": "0000-0002-1827-1285",
                "clpid": "Ozdaglar-Asuman-E"
            },
            {
                "family_name": "Wierman",
                "given_name": "Adam",
                "orcid": "0000-0002-5923-0199",
                "clpid": "Wierman-A"
            }
        ],
        "abstract": "We study two-player zero-sum stochastic games, and propose a form of independent learning dynamics called Doubly Smoothed Best-Response dynamics, which integrates a discrete and doubly smoothed variant of the best-response dynamics into temporal-difference (TD)-learning and minimax value iteration. The resulting dynamics are payoff-based, convergent, rational, and symmetric among players. Our main results provide finite-sample guarantees. In particular, we prove the first-known O\u0305(1/\u03f5\u00b2) sample complexity bound for payoff-based independent learning dynamics, up to a smoothing bias. In the special case where the stochastic game has only one state (i.e., matrix games), we provide a sharper O\u0305(1/\u03f5) sample complexity. Our analysis uses a novel coupled Lyapunov drift approach to capture the evolution of multiple sets of coupled and stochastic iterates, which might be of independent interest.",
        "publisher": "arXiv",
        "publication_date": "2023-03-03"
    },
    {
        "id": "authors:8fa1j-r0853",
        "collection": "authors",
        "collection_id": "8fa1j-r0853",
        "cite_using_url": "https://resolver.caltech.edu/CaltechAUTHORS:20230316-204028845",
        "type": "monograph",
        "title": "Algorithmic Collective Action in Machine Learning",
        "author": [
            {
                "family_name": "Hardt",
                "given_name": "Moritz",
                "clpid": "Hardt-Moritz"
            },
            {
                "family_name": "Mazumdar",
                "given_name": "Eric",
                "orcid": "0000-0002-1815-269X",
                "clpid": "Mazumdar-Eric"
            },
            {
                "family_name": "Mendler-D\u00fcnner",
                "given_name": "Celestine",
                "orcid": "0000-0002-9880-7173",
                "clpid": "Mendler-D\u00fcnner-Celestine"
            },
            {
                "family_name": "Zrnic",
                "given_name": "Tijana",
                "clpid": "Zrnic-Tijana"
            }
        ],
        "abstract": "We initiate a principled study of algorithmic collective action on digital platforms that deploy machine learning algorithms. We propose a simple theoretical model of a collective interacting with a firm's learning algorithm. The collective pools the data of participating individuals and executes an algorithmic strategy by instructing participants how to modify their own data to achieve a collective goal. We investigate the consequences of this model in three fundamental learning-theoretic settings: the case of a nonparametric optimal learning algorithm, a parametric risk minimizer, and gradient-based optimization. In each setting, we come up with coordinated algorithmic strategies and characterize natural success criteria as a function of the collective's size. Complementing our theory, we conduct systematic experiments on a skill classification task involving tens of thousands of resumes from a gig platform for freelancers. Through more than two thousand model training runs of a BERT-like language model, we see a striking correspondence emerge between our empirical observations and the predictions made by our theory. Taken together, our theory and experiments broadly support the conclusion that algorithmic collectives of exceedingly small fractional size can exert significant control over a platform's learning algorithm.",
        "publisher": "arXiv",
        "publication_date": "2023-02-08"
    },
    {
        "id": "authors:445x7-d7434",
        "collection": "authors",
        "collection_id": "445x7-d7434",
        "cite_using_url": "https://resolver.caltech.edu/CaltechAUTHORS:20230316-204025426",
        "type": "monograph",
        "title": "Convergent First-Order Methods for Bi-level Optimization and Stackelberg Games",
        "author": [
            {
                "family_name": "Maheshwari",
                "given_name": "Chinmay",
                "orcid": "0000-0003-3596-2851",
                "clpid": "Maheshwari-Chinmay"
            },
            {
                "family_name": "Sastry",
                "given_name": "S. Shankar",
                "clpid": "Sastry-S-Shankar"
            },
            {
                "family_name": "Ratliff",
                "given_name": "Lillian",
                "orcid": "0000-0001-8936-0229",
                "clpid": "Ratliff-Lillian-J"
            },
            {
                "family_name": "Mazumdar",
                "given_name": "Eric",
                "orcid": "0000-0002-1815-269X",
                "clpid": "Mazumdar-Eric"
            }
        ],
        "abstract": "We propose an algorithm to solve a class of bi-level optimization problems using only first-order information. In particular, we focus on a class where the inner minimization has unique solutions. Unlike contemporary algorithms, our algorithm does not require the use of an oracle estimator for the gradient of the bi-level objective or an approximate solver for the inner problem. Instead, we alternate between descending on the inner problem using na\u00efve optimization methods and descending on the upper-level objective function using specially constructed gradient estimators. We provide non-asymptotic convergence rates to stationary points of the bi-level objective in the absence of convexity of the closed-loop function and further show asymptotic convergence to only local minima of the bi-level problem. The approach is inspired by ideas from the literature on two-timescale stochastic approximation algorithms.",
        "publisher": "arXiv",
        "publication_date": "2023-02-02"
    },
    {
        "id": "authors:pnp9k-akm20",
        "collection": "authors",
        "collection_id": "pnp9k-akm20",
        "cite_using_url": "https://resolver.caltech.edu/CaltechAUTHORS:20221219-234102223",
        "type": "monograph",
        "title": "Synthesizing Reactive Test Environments for Autonomous Systems: Testing Reach-Avoid Specifications with Multi-Commodity Flows",
        "author": [
            {
                "family_name": "Badithela",
                "given_name": "Apurva",
                "clpid": "Badithela-Apurva"
            },
            {
                "family_name": "Graebener",
                "given_name": "Josefine B.",
                "clpid": "Graebener-Josefine-B"
            },
            {
                "family_name": "Ubellacker",
                "given_name": "Wyatt",
                "orcid": "0000-0002-4732-6185",
                "clpid": "Ubellacker-Wyatt-L"
            },
            {
                "family_name": "Mazumdar",
                "given_name": "Eric V.",
                "orcid": "0000-0002-1815-269X",
                "clpid": "Mazumdar-Eric"
            },
            {
                "family_name": "Ames",
                "given_name": "Aaron D.",
                "orcid": "0000-0003-0848-3177",
                "clpid": "Ames-A-D"
            },
            {
                "family_name": "Murray",
                "given_name": "Richard M.",
                "orcid": "0000-0002-5785-7481",
                "clpid": "Murray-R-M"
            }
        ],
        "abstract": "We study automated test generation for verifying discrete decision-making modules in autonomous systems. We utilize linear temporal logic to encode the requirements on the system under test in the system specification and the behavior that we want to observe during the test is given as the test specification which is unknown to the system. First, we use the specifications and their corresponding non-deterministic B\u00fcchi automata to generate the specification product automaton. Second, a virtual product graph representing the high-level interaction between the system and the test environment is constructed modeling the product automaton encoding the system, the test environment, and specifications. The main result of this paper is an optimization problem, framed as a multi-commodity network flow problem, that solves for constraints on the virtual product graph which can then be projected to the test environment. Therefore, the result of the optimization problem is reactive test synthesis that ensures that the system meets the test specifications along with satisfying the system specifications. This framework is illustrated in simulation on grid world examples, and demonstrated on hardware with the Unitree A1 quadruped, wherein dynamic locomotion behaviors are verified in the context of reactive test environments.",
        "doi": "10.48550/arXiv.2210.10304",
        "publisher": "arXiv",
        "publication_date": "2022-10-19"
    },
    {
        "id": "authors:e6mv5-zqt31",
        "collection": "authors",
        "collection_id": "e6mv5-zqt31",
        "cite_using_url": "https://resolver.caltech.edu/CaltechAUTHORS:20221220-221907545",
        "type": "monograph",
        "title": "A Note on Zeroth-Order Optimization on the Simplex",
        "author": [
            {
                "family_name": "Zrnic",
                "given_name": "Tijana",
                "clpid": "Zrnic-Tijana"
            },
            {
                "family_name": "Mazumdar",
                "given_name": "Eric",
                "orcid": "0000-0002-1815-269X",
                "clpid": "Mazumdar-Eric"
            }
        ],
        "abstract": "We construct a zeroth-order gradient estimator for a smooth function defined on the probability simplex. The proposed estimator queries the simplex only. We prove that projected gradient descent and the exponential weights algorithm, when run with this estimator instead of exact gradients, converge at a O(T^(-1/4)) rate.",
        "doi": "10.48550/arXiv.2208.01185",
        "publisher": "arXiv",
        "publication_date": "2022-08-02"
    },
    {
        "id": "authors:89c54-60249",
        "collection": "authors",
        "collection_id": "89c54-60249",
        "cite_using_url": "https://resolver.caltech.edu/CaltechAUTHORS:20220715-171641949",
        "type": "monograph",
        "title": "Decentralized, Communication- and Coordination-free Learning in Structured Matching Markets",
        "author": [
            {
                "family_name": "Maheshwari",
                "given_name": "Chinmay",
                "clpid": "Maheshwari-Chinmay"
            },
            {
                "family_name": "Mazumdar",
                "given_name": "Eric",
                "orcid": "0000-0002-1815-269X",
                "clpid": "Mazumdar-Eric"
            },
            {
                "family_name": "Sastry",
                "given_name": "Shankar",
                "clpid": "Sastry-S-Shankar"
            }
        ],
        "abstract": "We study the problem of online learning in competitive settings in the context of two-sided matching markets. In particular, one side of the market, the agents, must learn about their preferences over the other side, the firms, through repeated interaction while competing with other agents for successful matches. We propose a class of decentralized, communication- and coordination-free algorithms that agents can use to reach their stable match in structured matching markets. In contrast to prior works, the proposed algorithms make decisions based solely on an agent's own history of play and require no foreknowledge of the firms' preferences. Our algorithms are constructed by splitting up the statistical problem of learning one's preferences, from noisy observations, from the problem of competing for firms. We show that under realistic structural assumptions on the underlying preferences of the agents and firms, the proposed algorithms incur a regret which grows at most logarithmically in the time horizon. Our results show that, in the case of matching markets, competition need not drastically affect the performance of decentralized, communication- and coordination-free online learning algorithms.",
        "doi": "10.48550/arXiv.2206.02344",
        "publisher": "arXiv",
        "publication_date": "2022-06-06"
    },
    {
        "id": "authors:cstrb-vrf53",
        "collection": "authors",
        "collection_id": "cstrb-vrf53",
        "cite_using_url": "https://resolver.caltech.edu/CaltechAUTHORS:20210903-213717702",
        "type": "monograph",
        "title": "Who Leads and Who Follows in Strategic Classification?",
        "author": [
            {
                "family_name": "Zrnic",
                "given_name": "Tijana",
                "clpid": "Zrnic-Tijana"
            },
            {
                "family_name": "Mazumdar",
                "given_name": "Eric",
                "orcid": "0000-0002-1815-269X",
                "clpid": "Mazumdar-Eric"
            },
            {
                "family_name": "Sastry",
                "given_name": "S. Shankar",
                "clpid": "Sastry-S-Shankar"
            },
            {
                "family_name": "Jordan",
                "given_name": "Michael I.",
                "orcid": "0000-0001-8935-817X",
                "clpid": "Jordan-Michael-I"
            }
        ],
        "abstract": "As predictive models are deployed into the real world, they must increasingly contend with strategic behavior. A growing body of work on strategic classification treats this problem as a Stackelberg game: the decision-maker \"leads\" in the game by deploying a model, and the strategic agents \"follow\" by playing their best response to the deployed model. Importantly, in this framing, the burden of learning is placed solely on the decision-maker, while the agents' best responses are implicitly treated as instantaneous. In this work, we argue that the order of play in strategic classification is fundamentally determined by the relative frequencies at which the decision-maker and the agents adapt to each other's actions. In particular, by generalizing the standard model to allow both players to learn over time, we show that a decision-maker that makes updates faster than the agents can reverse the order of play, meaning that the agents lead and the decision-maker follows. We observe in standard learning settings that such a role reversal can be desirable for both the decision-maker and the strategic agents. Finally, we show that a decision-maker with the freedom to choose their update frequency can induce learning dynamics that converge to Stackelberg equilibria with either order of play.",
        "doi": "10.48550/arXiv.2106.12529",
        "publisher": "arXiv",
        "publication_date": "2021-06-23"
    },
    {
        "id": "authors:5jj0d-zdx15",
        "collection": "authors",
        "collection_id": "5jj0d-zdx15",
        "cite_using_url": "https://resolver.caltech.edu/CaltechAUTHORS:20210903-213714292",
        "type": "monograph",
        "title": "Zeroth-Order Methods for Convex-Concave Minmax Problems: Applications to Decision-Dependent Risk Minimization",
        "author": [
            {
                "family_name": "Maheshwari",
                "given_name": "Chinmay",
                "clpid": "Maheshwari-Chinmay"
            },
            {
                "family_name": "Chiu",
                "given_name": "Chih-Yuan",
                "clpid": "Chiu-Chih-Yuan"
            },
            {
                "family_name": "Mazumdar",
                "given_name": "Eric",
                "orcid": "0000-0002-1815-269X",
                "clpid": "Mazumdar-Eric"
            },
            {
                "family_name": "Sastry",
                "given_name": "S. Shankar",
                "clpid": "Sastry-S-Shankar"
            },
            {
                "family_name": "Ratliff",
                "given_name": "Lillian J.",
                "orcid": "0000-0001-8936-0229",
                "clpid": "Ratliff-Lillian-J"
            }
        ],
        "abstract": "Min-max optimization is emerging as a key framework for analyzing problems of robustness to strategically and adversarially generated data. We propose a random reshuffling-based gradient free Optimistic Gradient Descent-Ascent algorithm for solving convex-concave min-max problems with finite sum structure. We prove that the algorithm enjoys the same convergence rate as that of zeroth-order algorithms for convex minimization problems. We further specialize the algorithm to solve distributionally robust, decision-dependent learning problems, where gradient information is not readily available. Through illustrative simulations, we observe that our proposed approach learns models that are simultaneously robust against adversarial distribution shifts and strategic decisions from the data sources, and outperforms existing methods from the strategic classification literature.",
        "doi": "10.48550/arXiv.2106.09082",
        "publisher": "arXiv",
        "publication_date": "2021-06-16"
    },
    {
        "id": "authors:mjhq5-nd880",
        "collection": "authors",
        "collection_id": "mjhq5-nd880",
        "cite_using_url": "https://resolver.caltech.edu/CaltechAUTHORS:20210903-213710817",
        "type": "monograph",
        "title": "Fast Distributionally Robust Learning with Variance Reduced Min-Max Optimization",
        "author": [
            {
                "family_name": "Yu",
                "given_name": "Yaodong",
                "orcid": "0000-0003-0540-8526",
                "clpid": "Yu-Yaodong"
            },
            {
                "family_name": "Lin",
                "given_name": "Tianyi",
                "orcid": "0000-0002-5323-1852",
                "clpid": "Lin-Tianyi-Darren"
            },
            {
                "family_name": "Mazumdar",
                "given_name": "Eric",
                "orcid": "0000-0002-1815-269X",
                "clpid": "Mazumdar-Eric"
            },
            {
                "family_name": "Jordan",
                "given_name": "Michael I.",
                "orcid": "0000-0001-8935-817X",
                "clpid": "Jordan-Michael-I"
            }
        ],
        "abstract": "Distributionally robust supervised learning (DRSL) is emerging as a key paradigm for building reliable machine learning systems for real-world applications -- reflecting the need for classifiers and predictive models that are robust to the distribution shifts that arise from phenomena such as selection bias or nonstationarity. Existing algorithms for solving Wasserstein DRSL -- one of the most popular DRSL frameworks based around robustness to perturbations in the Wasserstein distance -- involve solving complex subproblems or fail to make use of stochastic gradients, limiting their use in large-scale machine learning problems. We revisit Wasserstein DRSL through the lens of min-max optimization and derive scalable and efficiently implementable stochastic extra-gradient algorithms which provably achieve faster convergence rates than existing approaches. We demonstrate their effectiveness on synthetic and real data when compared to existing DRSL approaches. Key to our results is the use of variance reduction and random reshuffling to accelerate stochastic min-max optimization, the analysis of which may be of independent interest.",
        "doi": "10.48550/arXiv.2104.13326",
        "publisher": "arXiv",
        "publication_date": "2021-04-27"
    },
    {
        "id": "authors:mr6zn-1js36",
        "collection": "authors",
        "collection_id": "mr6zn-1js36",
        "cite_using_url": "https://resolver.caltech.edu/CaltechAUTHORS:20210903-220351518",
        "type": "monograph",
        "title": "On Thompson Sampling with Langevin Algorithms",
        "author": [
            {
                "family_name": "Mazumdar",
                "given_name": "Eric",
                "orcid": "0000-0002-1815-269X",
                "clpid": "Mazumdar-Eric"
            },
            {
                "family_name": "Pacchiano",
                "given_name": "Aldo",
                "clpid": "Pacchiano-Aldo"
            },
            {
                "family_name": "Ma",
                "given_name": "Yi-an",
                "orcid": "0000-0001-6074-6638",
                "clpid": "Ma-Yi-an"
            },
            {
                "family_name": "Bartlett",
                "given_name": "Peter L.",
                "orcid": "0000-0002-8760-3140",
                "clpid": "Bartlett-Peter-L"
            },
            {
                "family_name": "Jordan",
                "given_name": "Michael I.",
                "orcid": "0000-0001-8935-817X",
                "clpid": "Jordan-Michael-I"
            }
        ],
        "abstract": "Thompson sampling for multi-armed bandit problems is known to enjoy favorable performance in both theory and practice. However, it suffers from a significant limitation computationally, arising from the need for samples from posterior distributions at every iteration. We propose two Markov Chain Monte Carlo (MCMC) methods tailored to Thompson sampling to address this issue. We construct quickly converging Langevin algorithms to generate approximate samples that have accuracy guarantees, and we leverage novel posterior concentration rates to analyze the regret of the resulting approximate Thompson sampling algorithm. Further, we specify the necessary hyperparameters for the MCMC procedure to guarantee optimal instance-dependent frequentist regret while having low computational complexity. In particular, our algorithms take advantage of both posterior concentration and a sample reuse mechanism to ensure that only a constant number of iterations and a constant amount of data is needed in each round. The resulting approximate Thompson sampling algorithm has logarithmic regret and its computational complexity does not scale with the time horizon of the algorithm.",
        "doi": "10.48550/arXiv.2002.10002",
        "publisher": "arXiv",
        "publication_date": "2020-02-23"
    },
    {
        "id": "authors:8wfkd-9vp35",
        "collection": "authors",
        "collection_id": "8wfkd-9vp35",
        "cite_using_url": "https://resolver.caltech.edu/CaltechAUTHORS:20210903-213700306",
        "type": "monograph",
        "title": "Policy-Gradient Algorithms Have No Guarantees of Convergence in Linear Quadratic Games",
        "author": [
            {
                "family_name": "Mazumdar",
                "given_name": "Eric",
                "orcid": "0000-0002-1815-269X",
                "clpid": "Mazumdar-Eric"
            },
            {
                "family_name": "Ratliff",
                "given_name": "Lillian J.",
                "orcid": "0000-0001-8936-0229",
                "clpid": "Ratliff-Lillian-J"
            },
            {
                "family_name": "Jordan",
                "given_name": "Michael I.",
                "orcid": "0000-0001-8935-817X",
                "clpid": "Jordan-Michael-I"
            },
            {
                "family_name": "Sastry",
                "given_name": "S. Shankar",
                "clpid": "Sastry-S-Shankar"
            }
        ],
        "abstract": "We show by counterexample that policy-gradient algorithms have no guarantees of even local convergence to Nash equilibria in continuous action and state space multi-agent settings. To do so, we analyze gradient-play in N-player general-sum linear quadratic games, a classic game setting which is recently emerging as a benchmark in the field of multi-agent learning. In such games the state and action spaces are continuous and global Nash equilibria can be found by solving coupled Riccati equations. Further, gradient-play in LQ games is equivalent to multi-agent policy-gradient. We first show that these games are surprisingly not convex games. Despite this, we are still able to show that the only critical points of the gradient dynamics are global Nash equilibria. We then give sufficient conditions under which policy-gradient will avoid the Nash equilibria, and generate a large number of general-sum linear quadratic games that satisfy these conditions. In such games we empirically observe the players converging to limit cycles for which the time average does not coincide with a Nash equilibrium. The existence of such games indicates that one of the most popular approaches to solving reinforcement learning problems in the classic reinforcement learning setting has no local guarantee of convergence in multi-agent settings. Further, the ease with which we can generate these counterexamples suggests that such situations are not mere edge cases and are in fact quite common.",
        "doi": "10.48550/arXiv.1907.03712",
        "publisher": "arXiv",
        "publication_date": "2019-07-08"
    },
    {
        "id": "authors:snptr-yqq39",
        "collection": "authors",
        "collection_id": "snptr-yqq39",
        "cite_using_url": "https://resolver.caltech.edu/CaltechAUTHORS:20210903-213656891",
        "type": "monograph",
        "title": "Convergence Analysis of Gradient-Based Learning with Non-Uniform Learning Rates in Non-Cooperative Multi-Agent Settings",
        "author": [
            {
                "family_name": "Chasnov",
                "given_name": "Benjamin",
                "orcid": "0000-0003-3484-2997",
                "clpid": "Chasnov-Benjamin"
            },
            {
                "family_name": "Ratliff",
                "given_name": "Lillian J.",
                "orcid": "0000-0001-8936-0229",
                "clpid": "Ratliff-Lillian-J"
            },
            {
                "family_name": "Mazumdar",
                "given_name": "Eric",
                "orcid": "0000-0002-1815-269X",
                "clpid": "Mazumdar-Eric"
            },
            {
                "family_name": "Burden",
                "given_name": "Samuel A.",
                "clpid": "Burden-Samuel-A"
            }
        ],
        "abstract": "Considering a class of gradient-based multi-agent learning algorithms in non-cooperative settings, we provide local convergence guarantees to a neighborhood of a stable local Nash equilibrium. In particular, we consider continuous games where agents learn in (i) deterministic settings with oracle access to their gradient and (ii) stochastic settings with an unbiased estimator of their gradient. Utilizing the minimum and maximum singular values of the game Jacobian, we provide finite-time convergence guarantees in the deterministic case. On the other hand, in the stochastic case, we provide concentration bounds guaranteeing that with high probability agents will converge to a neighborhood of a stable local Nash equilibrium in finite time. Different than other works in this vein, we also study the effects of non-uniform learning rates on the learning dynamics and convergence rates. We find that much like preconditioning in optimization, non-uniform learning rates cause a distortion in the vector field which can, in turn, change the rate of convergence and the shape of the region of attraction. The analysis is supported by numerical examples that illustrate different aspects of the theory. We conclude with discussion of the results and open questions.",
        "doi": "10.48550/arXiv.1906.00731",
        "publisher": "arXiv",
        "publication_date": "2019-05-30"
    },
    {
        "id": "authors:a8c6w-z2554",
        "collection": "authors",
        "collection_id": "a8c6w-z2554",
        "cite_using_url": "https://resolver.caltech.edu/CaltechAUTHORS:20210903-213653378",
        "type": "monograph",
        "title": "On Finding Local Nash Equilibria (and Only Local Nash Equilibria) in Zero-Sum Games",
        "author": [
            {
                "family_name": "Mazumdar",
                "given_name": "Eric",
                "orcid": "0000-0002-1815-269X",
                "clpid": "Mazumdar-Eric"
            },
            {
                "family_name": "Jordan",
                "given_name": "Michael I.",
                "clpid": "Jordan-Michael-I"
            },
            {
                "family_name": "Sastry",
                "given_name": "S. Shankar",
                "clpid": "Sastry-S-Shankar"
            }
        ],
        "abstract": "We propose local symplectic surgery, a two-timescale procedure for finding local Nash equilibria in two-player zero-sum games. We first show that previous gradient-based algorithms cannot guarantee convergence to local Nash equilibria due to the existence of non-Nash stationary points. By taking advantage of the differential structure of the game, we construct an algorithm for which the local Nash equilibria are the only attracting fixed points. We also show that the algorithm exhibits no oscillatory behaviors in neighborhoods of equilibria and show that it has the same per-iteration complexity as other recently proposed algorithms. We conclude by validating the algorithm on two numerical examples: a toy example with multiple Nash equilibria and a non-Nash equilibrium, and the training of a small generative adversarial network (GAN).",
        "doi": "10.48550/arXiv.1901.00838",
        "publisher": "arXiv",
        "publication_date": "2019-01-03"
    },
    {
        "id": "authors:nx17y-01p34",
        "collection": "authors",
        "collection_id": "nx17y-01p34",
        "cite_using_url": "https://resolver.caltech.edu/CaltechAUTHORS:20210903-213649911",
        "type": "monograph",
        "title": "A Multi-Armed Bandit Approach for Online Expert Selection in Markov Decision Processes",
        "author": [
            {
                "family_name": "Mazumdar",
                "given_name": "Eric",
                "orcid": "0000-0002-1815-269X",
                "clpid": "Mazumdar-Eric"
            },
            {
                "family_name": "Dong",
                "given_name": "Roy",
                "orcid": "0000-0001-8034-4329",
                "clpid": "Dong-Roy"
            },
            {
                "family_name": "R\u00fabies Royo",
                "given_name": "Vicen\u00e7",
                "clpid": "R\u00fabies-Royo-Vicen\u00e7"
            },
            {
                "family_name": "Tomlin",
                "given_name": "Claire",
                "orcid": "0000-0003-3192-3185",
                "clpid": "Tomlin-Claire-J"
            },
            {
                "family_name": "Sastry",
                "given_name": "S. Shankar",
                "clpid": "Sastry-S-Shankar"
            }
        ],
        "abstract": "We formulate a multi-armed bandit (MAB) approach to choosing expert policies online in Markov decision processes (MDPs). Given a set of expert policies trained on a state and action space, the goal is to maximize the cumulative reward of our agent. The hope is to quickly find the best expert in our set. The MAB formulation allows us to quantify the performance of an algorithm in terms of the regret incurred from not choosing the best expert from the beginning. We first develop the theoretical framework for MABs in MDPs, and then present a basic regret decomposition identity. We then adapt the classical Upper Confidence Bounds algorithm to the problem of choosing experts in MDPs and prove that the expected regret grows at worst at a logarithmic rate. Lastly, we validate the theory on a small MDP.",
        "doi": "10.48550/arXiv.1707.05714",
        "publisher": "arXiv",
        "publication_date": "2017-07-18"
    }
]