[
    {
        "id": "authors:zhebt-z4d36",
        "collection": "authors",
        "collection_id": "zhebt-z4d36",
        "cite_using_url": "https://authors.library.caltech.edu/records/zhebt-z4d36",
        "type": "article",
        "title": "On Finding Local Nash Equilibria (and only Local Nash Equilibria) in Zero-Sum Games",
        "author": [
            {
                "family_name": "Mazumdar",
                "given_name": "Eric",
                "orcid": "0000-0002-1815-269X",
                "clpid": "Mazumdar-Eric"
            },
            {
                "family_name": "Sastry",
                "given_name": "S. Shankar",
                "orcid": "0009-0000-9021-7235"
            },
            {
                "family_name": "Jordan",
                "given_name": "Michael I.",
                "orcid": "0000-0001-8935-817X"
            }
        ],
        "abstract": "We propose local symplectic surgery, a two-timescale procedure for finding local Nash equilibria in two-player zero-sum games. We first show that previous gradient-based algorithms cannot guarantee convergence to local Nash equilibria due to the existence of non-Nash stationary points. By taking advantage of the differential structure of the game, we construct an algorithm for which the local Nash equilibria are the only attracting fixed points. Further, we show that the algorithm exhibits no oscillatory behavior in neighborhoods of equilibria and that it has the same per-iteration complexity as other recently proposed algorithms. Furthermore we give convergence rates in structured classes of zero-sum games. We conclude by validating the algorithm on two numerical examples: a toy example with multiple Nash equilibria and a non-Nash equilibrium, and the training of a small generative adversarial network (GAN).",
        "doi": "10.1145/3728479",
        "issn": "2831-3194",
        "publisher": "Association for Computing Machinery (ACM)",
        "publication": "ACM / IMS Journal of Data Science",
        "publication_date": "2025-05-13"
    },
    {
        "id": "authors:g79fh-yqz70",
        "collection": "authors",
        "collection_id": "g79fh-yqz70",
        "cite_using_url": "https://resolver.caltech.edu/CaltechAUTHORS:20220714-212437915",
        "type": "article",
        "title": "Langevin Monte Carlo for Contextual Bandits",
        "author": [
            {
                "family_name": "Xu",
                "given_name": "Pan",
                "clpid": "Xu-Pan"
            },
            {
                "family_name": "Zheng",
                "given_name": "Hongkai",
                "clpid": "Zheng-Hongkai"
            },
            {
                "family_name": "Mazumdar",
                "given_name": "Eric V.",
                "orcid": "0000-0002-1815-269X",
                "clpid": "Mazumdar-Eric"
            },
            {
                "family_name": "Azizzadenesheli",
                "given_name": "Kamyar",
                "orcid": "0000-0001-8507-1868",
                "clpid": "Azizzadenesheli-Kamyar"
            },
            {
                "family_name": "Anandkumar",
                "given_name": "Anima",
                "orcid": "0000-0002-6974-6797",
                "clpid": "Anandkumar-A"
            }
        ],
        "abstract": "We study the efficiency of Thompson sampling for contextual bandits. Existing Thompson sampling-based algorithms need to construct a Laplace approximation (i.e., a Gaussian distribution) of the posterior distribution, which is inefficient to sample in high dimensional applications for general covariance matrices. Moreover, the Gaussian approximation may not be a good surrogate for the posterior distribution for general reward generating functions. We propose an efficient posterior sampling algorithm, viz., Langevin Monte Carlo Thompson Sampling (LMC-TS), that uses Markov Chain Monte Carlo (MCMC) methods to directly sample from the posterior distribution in contextual bandits. Our method is computationally efficient since it only needs to perform noisy gradient descent updates without constructing the Laplace approximation of the posterior distribution. We prove that the proposed algorithm achieves the same sublinear regret bound as the best Thompson sampling algorithms for a special case of contextual bandits, viz., linear contextual bandits. We conduct experiments on both synthetic data and real-world datasets on different contextual bandit models, which demonstrates that directly sampling from the posterior is both computationally efficient and competitive in performance.",
        "doi": "10.48550/arXiv.2206.11254",
        "issn": "2640-3498",
        "publisher": "ML Research Press",
        "publication": "Proceedings of Machine Learning Research",
        "publication_date": "2022-06-22",
        "volume": "162",
        "pages": "24830-24850"
    },
    {
        "id": "authors:n5mef-bvm32",
        "collection": "authors",
        "collection_id": "n5mef-bvm32",
        "cite_using_url": "https://resolver.caltech.edu/CaltechAUTHORS:20210907-195235166",
        "type": "article",
        "title": "Convergence Analysis of Gradient-Based Learning in Continuous Games",
        "author": [
            {
                "family_name": "Chasnov",
                "given_name": "Benjamin",
                "orcid": "0000-0003-3484-2997",
                "clpid": "Chasnov-Benjamin"
            },
            {
                "family_name": "Ratliff",
                "given_name": "Lillian",
                "orcid": "0000-0001-8936-0229",
                "clpid": "Ratliff-Lillian-J"
            },
            {
                "family_name": "Mazumdar",
                "given_name": "Eric",
                "orcid": "0000-0002-1815-269X",
                "clpid": "Mazumdar-Eric"
            },
            {
                "family_name": "Burden",
                "given_name": "Samuel",
                "clpid": "Burden-Samuel-A"
            }
        ],
        "abstract": "Considering a class of gradient-based multi-agent learning algorithms in non-cooperative settings, we provide convergence guarantees to a neighborhood of a stable Nash equilibrium. In particular, we consider continuous games where agents learn in 1) deterministic settings with oracle access to their gradient and 2) stochastic settings with an unbiased estimator of their gradient. We also study the effects of non-uniform learning rates, which causes a distortion of the vector field that can alter which equilibrium the agents converge to and the path they take. We support the analysis with numerical examples that provide insight into how one might synthesize games to achieve desired equilibria.",
        "issn": "2640-3498",
        "publisher": "ML Research Press",
        "publication": "Proceedings of Machine Learning Research",
        "publication_date": "2020-08-06",
        "volume": "115",
        "pages": "935-944"
    },
    {
        "id": "authors:41arc-vy622",
        "collection": "authors",
        "collection_id": "41arc-vy622",
        "cite_using_url": "https://resolver.caltech.edu/CaltechAUTHORS:20210903-222215724",
        "type": "article",
        "title": "Inverse Risk-Sensitive Reinforcement Learning",
        "author": [
            {
                "family_name": "Ratliff",
                "given_name": "Lillian J.",
                "orcid": "0000-0001-8936-0229",
                "clpid": "Ratliff-Lillian-J"
            },
            {
                "family_name": "Mazumdar",
                "given_name": "Eric",
                "orcid": "0000-0002-1815-269X",
                "clpid": "Mazumdar-Eric"
            }
        ],
        "abstract": "This work addresses the problem of inverse reinforcement learning in Markov decision processes where the decision-making agent is risk-sensitive. In particular, a risk-sensitive reinforcement learning algorithm with convergence guarantees that makes use of coherent risk metrics and models of human decision-making which have their origins in behavioral psychology and economics is presented. The risk-sensitive reinforcement learning algorithm provides the theoretical underpinning for a gradient-based inverse reinforcement learning algorithm that seeks to minimize a loss function defined on the observed behavior. It is shown that the gradient of the loss function with respect to the model parameters is well defined and computable via a contraction map argument. Evaluation of the proposed technique is performed on a Grid World example, a canonical benchmark problem.",
        "doi": "10.1109/TAC.2019.2926674",
        "issn": "0018-9286",
        "publisher": "IEEE",
        "publication": "IEEE Transactions on Automatic Control",
        "publication_date": "2020-03",
        "series_number": "3",
        "volume": "65",
        "issue": "3",
        "pages": "1256-1263"
    },
    {
        "id": "authors:8a04w-etx35",
        "collection": "authors",
        "collection_id": "8a04w-etx35",
        "cite_using_url": "https://resolver.caltech.edu/CaltechAUTHORS:20210907-200115513",
        "type": "article",
        "title": "On Gradient-Based Learning in Continuous Games",
        "author": [
            {
                "family_name": "Mazumdar",
                "given_name": "Eric",
                "orcid": "0000-0002-1815-269X",
                "clpid": "Mazumdar-Eric"
            },
            {
                "family_name": "Ratliff",
                "given_name": "Lillian J.",
                "orcid": "0000-0001-8936-0229",
                "clpid": "Ratliff-Lillian-J"
            },
            {
                "family_name": "Sastry",
                "given_name": "S. Shankar",
                "clpid": "Sastry-S-Shankar"
            }
        ],
        "abstract": "We introduce a general framework for competitive gradient-based learning that encompasses a wide breadth of multiagent learning algorithms, and analyze the limiting behavior of competitive gradient-based learning algorithms using dynamical systems theory. For both general-sum and potential games, we characterize a nonnegligible subset of the local Nash equilibria that will be avoided if each agent employs a gradient-based learning algorithm. We also shed light on the issue of convergence to non-Nash strategies in general- and zero-sum games, which may have no relevance to the underlying game, and arise solely due to the choice of algorithm. The existence and frequency of such strategies may explain some of the difficulties encountered when using gradient descent in zero-sum games as, e.g., in the training of generative adversarial networks. To reinforce the theoretical contributions, we provide empirical results that highlight the frequency of linear quadratic dynamic games (a benchmark for multiagent reinforcement learning) that admit global Nash equilibria that are almost surely avoided by policy gradient.",
        "doi": "10.1137/18m1231298",
        "issn": "2577-0187",
        "publisher": "Society for Industrial & Applied Mathematics",
        "publication": "SIAM Journal on Mathematics of Data Science",
        "publication_date": "2020-02-18",
        "series_number": "1",
        "volume": "2",
        "issue": "1",
        "pages": "103-131"
    }
]