@phdthesis{10.7907/cdf6-0w78, author = {Li, Tongxin}, title = {Learning-Augmented Control and Decision-Making: Theory and Applications in Smart Grids}, school = {California Institute of Technology}, year = {2023}, doi = {10.7907/cdf6-0w78}, url = {https://resolver.caltech.edu/CaltechThesis:07202022-040725024}, abstract = {
Achieving carbon neutrality by 2050 leads not only to increasing penetration of renewable energy, but also to an explosive growth of smart meter data. Augmenting classical methods in real-world cyber-physical systems such as smart grids with black-box AI tools, forecasts, and ML algorithms has recently attracted growing interest. Integrating AI techniques into smart grids, on the one hand, provides a new approach to handling the uncertainties caused by renewable resources and human behaviors; on the other hand, it raises practical concerns for the AI-integrated algorithms, such as reliability, stability, privacy, and scalability.
This dissertation focuses on solving problems that arise in designing learning-augmented control and decision-making algorithms.
The results presented in this dissertation are three-fold. We first study a problem in linear quadratic control, where imperfect/untrusted AI predictions of system perturbations are available. We show that it is possible to design a learning-augmented algorithm with performance guarantees that is aggressive if the predictions are accurate and conservative if they are imperfect. Machine-learned black-box policies are ubiquitous for nonlinear control problems. Meanwhile, crude model information is often available for these problems from, e.g., linear approximations of nonlinear dynamics. We next study the problem of equipping a black-box control policy with model-based advice for nonlinear control on a single trajectory. We first show a general negative result: a naive convex combination of a black-box policy and a linear model-based policy can lead to instability, even if the two policies are both stabilizing. We then propose an adaptive λ-confident policy, with a coefficient λ indicating the confidence in the black-box policy, and prove its stability. In addition, with bounded nonlinearity, we show that the adaptive λ-confident policy achieves a bounded competitive ratio when the black-box policy is near-optimal. Finally, we propose an online learning approach to implement the adaptive λ-confident policy and verify its efficacy in case studies on the Cart-Pole problem and a real-world electric vehicle (EV) charging problem with data bias due to COVID-19.
Aggregators have emerged as crucial tools for the coordination of distributed, controllable loads. To be used effectively, an aggregator must be able to communicate the available flexibility of the loads it controls, known as the aggregate flexibility, to a system operator. However, most existing aggregate flexibility measures are slow-timescale estimations, and much less attention has been paid to real-time coordination between an aggregator and an operator. In the second part of this dissertation, we consider solving an online decision-making problem in a closed-loop system and present a design of real-time aggregate flexibility feedback, termed the maximum entropy feedback (MEF). In addition to deriving analytic properties of the MEF, we show, combining learning and control, that it can be approximated using reinforcement learning and used as a penalty term in a novel control algorithm, the penalized predictive control (PPC), which enables efficient communication, fast computation, and lower costs. We show that under certain regularity assumptions, the PPC is optimal. We illustrate the efficacy of the PPC using a dataset from an adaptive electric vehicle charging network and show that it outperforms classical model predictive control (MPC). From a theoretical perspective, a two-controller problem is formulated. A central controller chooses an action from a feasible set that is determined by time-varying and coupling constraints, which depend on all past actions and states. The central controller’s goal is to minimize the cumulative cost; however, the controller has direct access to neither the feasible set nor the dynamics, which are determined by a remote local controller. Instead, the central controller receives only an aggregate summary of the feasibility information from the local controller, which does not know the system costs. We show that it is possible for an online algorithm using feasibility information to nearly match the dynamic regret of an online algorithm using perfect information whenever the feasible sets satisfy a certain criterion, which holds for inventory and tracking constraints.
The third part of this dissertation consists of examples of learning, inference, and data analysis methods for power system identification and EV charging. We present a power system identification problem with noisy nodal measurements and efficient algorithms for it, based on fundamental trade-offs between the number of measurements, the complexity of the graph class, and the probability of error. Next, we specifically consider prediction and unsupervised learning tasks in EV charging. We provide basic data analysis results for a public dataset released by Caltech and develop a novel iterative clustering method for classifying time series of EV charging rates.
}, address = {1200 East California Boulevard, Pasadena, California 91125}, advisor = {Low, Steven H.}, } @phdthesis{10.7907/8817-xy25, author = {Liang, Chen}, title = {Cascading Failures in Power Systems: Modeling, Characterization, and Mitigation}, school = {California Institute of Technology}, year = {2022}, doi = {10.7907/8817-xy25}, url = {https://resolver.caltech.edu/CaltechTHESIS:06032022-035416994}, abstract = {Reliability is a critical goal for power systems. Due to the connectivity of power grids, an initial failure may trigger a cascade of failures and eventually lead to a large-scale blackout, causing significant economic and social impacts. Cascading failure analysis thus draws wide attention from power system practitioners and researchers. A well-known observation is that cascading failures in power systems propagate non-locally because of the complex mechanism of power grids. Such non-local propagation makes it particularly challenging to model, analyze and control the failure process. In this thesis, we tackle these challenges by establishing a mathematical theory to model and characterize failure patterns, discover structural properties of failure propagation, and design novel techniques for failure mitigation.
First, we propose a failure propagation model considering both the fast-timescale system frequency dynamics and the slow-timescale line tripping process. This model provides mathematical justification for the widely used static DC model and can be generalized to capture a variety of failure propagation patterns induced by different control mechanisms of the power grid. More importantly, this model provides flexibility to design real-time control algorithms for failure mitigation and localization.
Second, we provide a complete characterization of line failures under the static DC model. Our results unveil a deep connection between the power redistribution patterns and the network block decomposition. More specifically, we show that a non-cut line failure in a block will only impact the branch power flows on the transmission lines within that block. In contrast, a cut-set line failure will propagate globally, depending on both the power balancing rules and the network topological structure. Further, we discuss three types of interface networks to connect the sub-grids, all of which achieve better failure localization performance.
Third, we study corrective control algorithms for failure mitigation. We integrate a distributed frequency control strategy with the network block decomposition to provide provable failure mitigation and localization guarantees on line failures. This strategy operates on the frequency control timescale and supplements existing corrective mechanisms, improving grid reliability and operational efficiency. We further explore the failure mitigation approach with direct post-contingency injection adjustments. Specifically, we propose an optimization-based control method with strong structural properties, which is highly desirable in large-scale power networks.
}, address = {1200 East California Boulevard, Pasadena, California 91125}, advisor = {Low, Steven H. and Wierman, Adam C.}, } @phdthesis{10.7907/vn9x-xj10, author = {Murray, Riley John}, title = {Applications of Convex Analysis to Signomial and Polynomial Nonnegativity Problems}, school = {California Institute of Technology}, year = {2021}, doi = {10.7907/vn9x-xj10}, url = {https://resolver.caltech.edu/CaltechTHESIS:05202021-194439071}, abstract = {Here is a question that is easy to state, but often hard to answer:
Is this function nonnegative on this set?
When faced with such a question, one often makes appeals to known inequalities. One crafts arguments that are sufficient to establish the nonnegativity of the function, rather than determining the function’s precise range of values. This thesis studies sufficient conditions for nonnegativity of signomials and polynomials. Conceptually, signomials may be viewed as generalized polynomials that feature arbitrary real exponents, but with variables restricted to the positive orthant.
Our methods leverage efficient algorithms for a type of convex optimization known as relative entropy programming (REP). By virtue of this integration with REP, our methods can help answer questions like the following:
Is there some function, in this particular space of functions, that is nonnegative on this set?
The ability to answer such questions is extremely useful in applied mathematics. Alternative approaches in this same vein (e.g., methods for polynomials based on semidefinite programming) have been used successfully as convex relaxation frameworks for nonconvex optimization, as mechanisms for analyzing dynamical systems, and even as tools for solving nonlinear partial differential equations.
This thesis builds on the sums of arithmetic-geometric exponentials (SAGE) approach to signomial nonnegativity. The term “exponential” appears in the SAGE acronym because SAGE parameterizes signomials in terms of exponential functions.
Our first round of contributions concerns the original SAGE approach. We employ basic techniques in convex analysis and convex geometry to derive structural results for spaces of SAGE signomials and exactness results for SAGE-based REP relaxations of nonconvex signomial optimization problems. We frame our analysis primarily in terms of the coefficients of a signomial’s basis expansion rather than in terms of signomials themselves. The effect of this framing is that our results for signomials readily transfer to polynomials. In particular, we are led to define a new concept of SAGE polynomials. For sparse polynomials, this method offers an exponential efficiency improvement relative to certificates of nonnegativity obtained through semidefinite programming.
We go on to create the conditional SAGE methodology for exploiting convex substructure in constrained signomial nonnegativity problems. The basic insight here is that since the standard relative entropy representation of SAGE signomials is obtained by a suitable application of convex duality, we are free to incorporate additional convex constraints into the duality argument. In the course of explaining this idea we provide some illustrative examples in signomial optimization and the analysis of chemical dynamics.
The majority of this thesis is dedicated to exploring fundamental questions surrounding conditional SAGE signomials. We approach these questions through analysis frameworks of sublinear circuits and signomial rings. These sublinear circuits generalize simplicial circuits of affine-linear matroids, and lead to rich modes of analysis for sets that are simultaneously convex in the usual sense and convex under a logarithmic transformation. The concept of signomial rings lets us develop a powerful signomial Positivstellensatz and an elementary signomial moment theory. The Positivstellensatz provides for an effective hierarchy of REP relaxations for approaching the value of a nonconvex signomial minimization problem from below, as well as a first-of-its-kind hierarchy for approaching the same value from above.
In parallel with our mathematical work, we have developed the sageopt Python package. Sageopt drives all the examples and experiments used throughout this thesis, and has been used by engineers to solve high-degree polynomial optimization problems at scales unattainable by alternative methods. We conclude this thesis with an explanation of how our theoretical results affected sageopt’s design.
}, address = {1200 East California Boulevard, Pasadena, California 91125}, } @phdthesis{10.7907/db29-am33, author = {London, Palma Alise den Nijs}, title = {Frameworks for High Dimensional Convex Optimization}, school = {California Institute of Technology}, year = {2021}, doi = {10.7907/db29-am33}, url = {https://resolver.caltech.edu/CaltechTHESIS:08162020-233437139}, abstract = {We present novel, efficient algorithms for solving extremely large optimization problems. A significant bottleneck today is that as the size of datasets grows, researchers across disciplines desire to solve prohibitively massive optimization problems. In this thesis, we present methods to compress optimization problems. The general goal is to represent a huge problem as a smaller problem or set of smaller problems, while still retaining enough information to ensure provable guarantees on solution quality and run time. We apply this approach to the following three settings.
First, we propose a framework for accelerating both linear program solvers and convex solvers for problems with linear constraints. Our focus is on a class of problems for which data is either very costly or hard to obtain. In these situations, the number of available data points m is much smaller than the number of variables, n. In a machine learning setting, this regime is increasingly prevalent since it is often advantageous to consider larger and larger feature spaces, while not necessarily obtaining proportionally more data. Analytically, we provide worst-case guarantees on both the runtime and the quality of the solution produced. Empirically, we show that our framework speeds up state-of-the-art commercial solvers by two orders of magnitude, while maintaining a near-optimal solution.
Second, we propose a novel approach for distributed optimization which uses far fewer messages than existing methods. We consider a setting in which the problem data are distributed over the nodes. We provide worst-case guarantees on both the amount of communication the algorithm requires and the quality of the solution it produces. The algorithm uses O(log(n+m)) messages with high probability. This is an exponential reduction compared to the O(n) communication required during each round of traditional consensus-based approaches. In terms of solution quality, our algorithm produces a feasible, near-optimal solution. Numerical results demonstrate that the approximation error matches that of ADMM in many cases, while using orders of magnitude less communication.
Lastly, we propose and analyze a provably accurate long-step infeasible interior point method (IPM) for linear programming. The core computational bottleneck in IPMs is the need to solve a linear system of equations at each iteration. We employ sketching techniques to make the linear system computation lighter, by handling the well-known ill-conditioning problems that occur when using iterative solvers in IPMs for LPs. In particular, we propose a preconditioned Conjugate Gradient iterative solver for the linear system. Our sketching strategy makes the condition number of the preconditioned system provably small. In practice we demonstrate that our approach significantly reduces the condition number of the linear system, and thus allows for more efficient solving on a range of benchmark datasets.
}, address = {1200 East California Boulevard, Pasadena, California 91125}, advisor = {Wierman, Adam C.}, } @phdthesis{10.7907/j494-j572, author = {Azizan Ruhi, Navid}, title = {Large-Scale Intelligent Systems: From Network Dynamics to Optimization Algorithms}, school = {California Institute of Technology}, year = {2021}, doi = {10.7907/j494-j572}, url = {https://resolver.caltech.edu/CaltechTHESIS:11182020-210226861}, abstract = {
The expansion of large-scale technological systems such as electrical grids, transportation networks, health care systems, telecommunication networks, the Internet (of things), and other societal networks has created numerous challenges and opportunities at the same time. These systems are often not yet as robust, efficient, sustainable, or smart as we would want them to be. Fueled by the massive amounts of data generated by all these systems, and with the recent advances in making sense out of data, there is a strong desire to make them more intelligent. However, developing large-scale intelligent systems is a multifaceted problem, involving several major challenges. First, large-scale systems typically exhibit complex dynamics due to the large number of entities interacting over a network. Second, because the system is composed of many interacting entities that make decentralized (and often self-interested) decisions, one has to properly design incentives and markets for such systems. Third, the massive computational needs caused by the scale of the system necessitate performing computations in a distributed fashion, which in turn requires devising new algorithms. Finally, one has to create algorithms that can learn from the copious amounts of data and generalize well. This thesis makes several contributions related to each of these four challenges.
Analyzing and understanding the network dynamics exhibited in societal systems is crucial for developing systems that are robust and efficient. In Part I of this thesis, we study one of the most important families of network dynamics, namely, that of epidemics, or spreading processes. Studying such processes is relevant for understanding and controlling the spread of, e.g., contagious diseases among people, ideas or fake news in online social networks, computer viruses in computer networks, or cascading failures in societal networks. We establish several results on the exact Markov chain model and the nonlinear “mean-field” approximations for various kinds of epidemics (i.e., SIS, SIRS, SEIRS, SIV, SEIV, and their variants).
Designing incentives and markets for large-scale systems is critical for their efficient operation and ensuring an alignment between the agents’ decentralized decisions and the global goals of the system. To that end, in Part II of this thesis, we study these issues in markets with non-convex costs as well as networked markets, which are of vital importance for, e.g., the smart grid. We propose novel pricing schemes for such markets, which satisfy all the desired market properties. We also reveal issues in the current incentives for distributed energy resources, such as renewables, and design optimization algorithms for efficient management of aggregators of such resources.
With the growing amounts of data generated by large-scale systems, and the fact that the data may already be dispersed across many units, it is becoming increasingly necessary to run computational tasks in a distributed fashion. Part III concerns developing algorithms for distributed computation. We propose a novel consensus-based algorithm for the task of solving large-scale systems of linear equations, which is one of the most fundamental problems in linear algebra, and a key step at the heart of many algorithms in scientific computing, machine learning, and beyond. In addition, in order to deal with the issue of heterogeneous delays in distributed computation, caused by slow machines, we develop a new coded computation technique. In both cases, the proposed methods offer significant speed-ups relative to the existing approaches.
Over the past decade, deep learning methods have become the most successful learning algorithms in a wide variety of tasks. However, the reasons behind their success (as well as their failures in some respects) are largely unexplained. It is widely believed that the success of deep learning is not just due to the deep architecture of the models, but also due to the behavior of the optimization algorithms, such as stochastic gradient descent (SGD), used for training them. In Part IV of this thesis, we characterize several properties, such as minimax optimality and implicit regularization, of SGD, and more generally, of the family of stochastic mirror descent (SMD). While SGD performs an implicit regularization, this regularization can be effectively controlled using SMD with a proper choice of mirror, which in turn can improve the generalization error.
}, address = {1200 East California Boulevard, Pasadena, California 91125}, advisor = {Wierman, Adam C. and Hassibi, Babak}, } @phdthesis{10.7907/bc2t-er54, author = {Su, Yu}, title = {Optimizing Cloud AI Platforms: Resource Allocation and Market Design}, school = {California Institute of Technology}, year = {2021}, doi = {10.7907/bc2t-er54}, url = {https://resolver.caltech.edu/CaltechTHESIS:06072021-172842077}, abstract = {The numerous applications of data-driven algorithms and tools across diverse industries have led to tremendous successes in recent years. As the volume of massive data that is created, collected, and consumed continues to grow, there are many new imposed challenges faced by today’s cloud AI platforms that support the deployment of machine learning algorithms on a large scale. In this thesis, we tackle the emerging challenges within cloud AI systems and beyond by adopting approaches from the fields of resource allocation and market design.
First, we propose a new scheduler, Generalized Earliest Time First (GETF), and provide provable, worst-case approximation guarantees for the goals of minimizing both makespan and total weighted completion time of tasks with precedence constraints on related machines with machine-dependent communication times. These two results address long-standing open problems. Further, we adopt the classic speed scaling function to model power consumption and use mean response time to measure performance. We propose the concept of pseudo-size to quantify the importance of tasks and design a family of two-stage scheduling frameworks based on the approximation of pseudo-size. Assuming a good approximation of the pseudo-size, we are able to provide the first provable bound on a linear combination of performance and energy goals in this setting.
Second, we study the design of mechanisms for data acquisition in settings with information leakage and verifiable data. We provide the first characterization of an optimal mechanism for data acquisition when agents are concerned about privacy and their data are correlated with each other. Additionally, the mechanism allows, for the first time, a trade-off between the bias and variance of the estimator. Transitioning from the data market to the energy market, we propose a new pricing scheme, which is applicable to general non-convex costs and allows the use of general parametric pricing functions. Optimizing for the quantities and the price parameters simultaneously, together with the ability to use general parametric pricing functions, allows our scheme to find prices that are typically economically more efficient and less discriminatory than those of existing schemes while still supporting a competitive equilibrium. In addition, we supplement the proposed method with a computationally efficient polynomial-time approximation algorithm, which can be used to approximate the optimal quantities and prices for general non-convex cost functions.
}, address = {1200 East California Boulevard, Pasadena, California 91125}, advisor = {Wierman, Adam C.}, } @phdthesis{10.7907/XZHX-1M46, author = {Ziani, Juba}, title = {Data: Implications for Markets and for Society}, school = {California Institute of Technology}, year = {2019}, doi = {10.7907/XZHX-1M46}, url = {https://resolver.caltech.edu/CaltechTHESIS:05292019-162418941}, abstract = {Every day, massive amounts of data are gathered, exchanged, and used to run statistical computations, train machine learning algorithms, and inform decisions on individuals and populations. The quick rise of data, the need to exchange and process it, to take data privacy concerns into account, and to understand how it affects decision-making, introduce many new and interesting economic, game theoretic, and algorithmic challenges.
The goal of this thesis is to provide theoretical foundations to approach these challenges. The first part of this thesis focuses on the design of mechanisms that purchase and then aggregate data from many sources, in order to perform statistical tasks. The second part of this thesis revolves around the societal concerns associated with the use of individuals’ data. The first such concern we examine is that of privacy when using sensitive data about individuals in statistical computations; we focus our attention on how privacy constraints interact with the task of designing mechanisms for the acquisition and aggregation of sensitive data. The second concern we focus on is that of fairness in decision-making: we aim to provide tools to society that help prevent discrimination against individuals and populations based on sensitive attributes in their data when making important decisions about them. Finally, we end this thesis with a study of the interactions between data and strategic behavior. There, we see data as a source of information that informs and affects agents’ incentives; we study how information revelation impacts agent behavior in auctions, and in turn how a seller should design auctions that take such information revelation into account.
}, address = {1200 East California Boulevard, Pasadena, California 91125}, advisor = {Ligett, Katrina A.}, } @phdthesis{10.7907/XY8M-8D94, author = {Pang, John Zhen Fu}, title = {Online Platforms in Networked Markets: Transparency, Anticipation and Demand Management}, school = {California Institute of Technology}, year = {2019}, doi = {10.7907/XY8M-8D94}, url = {https://resolver.caltech.edu/CaltechTHESIS:03132019-143428796}, abstract = {The global economy has been transformed by the introduction of online platforms in the past two decades. These companies, such as Uber and Amazon, have benefited and undergone massive growth, and are a critical part of the world economy today. Understanding these online platforms, their designs, and how participation changes with anticipation and uncertainty can help us identify the necessary ingredients for successful implementation of online platforms in the future, especially for those with underlying network constraints, e.g., the electricity grid.
This thesis makes three main contributions. First, we identify and compare common access and allocation control designs for online platforms, and highlight their trade-offs between transparency and control. We make these comparisons under a networked Cournot competition model and consider three popular designs: (i) open access, (ii) discriminatory access, and (iii) controlled allocation. Our findings reveal that designs that control access are more efficient than designs that control allocations, but open access designs are susceptible to substantial search costs. Next, we study the impact of demand management in a networked Stackelberg model considering network constraints and producer anticipation. We provide insights on limiting manipulation in these constrained networked marketplaces with nodal prices, and show that demand management mechanisms that traditionally aid system stability also play a vital role economically. In particular, we show that demand management empowers consumers and gives them “market power” to counter that of producers, limiting the impact of their anticipation and their potential for manipulation. Lastly, we study how participants (e.g., drivers on Uber) make competitive real-time production (driving) decisions. To that end, we design a novel pursuit algorithm for online optimization under limited inventory constraints. Our analysis yields an algorithm that is competitive and can be applied to achieve optimal results in the well-known one-way trading problem, as well as new variants of the original problem.
}, address = {1200 East California Boulevard, Pasadena, California 91125}, advisor = {Wierman, Adam C.}, } @phdthesis{10.7907/Z95M63W4, author = {Chen, Niangjun}, title = {Online Algorithms: From Prediction to Decision}, school = {California Institute of Technology}, year = {2018}, doi = {10.7907/Z95M63W4}, url = {https://resolver.caltech.edu/CaltechTHESIS:10182017-210853845}, abstract = {Making use of predictions is a crucial, but under-explored, area of sequential decision problems with limited information. While in practice most online algorithms rely on predictions to make real-time decisions, in theory their performance is only analyzed in simplified models of prediction noise, either adversarial or i.i.d. The goal of this thesis is to bridge this divide between theory and practice: to study online algorithms under more practical prediction models, gain a better understanding of the value of prediction, and design online algorithms that make the best use of predictions.
This thesis makes three main contributions. First, we propose a stochastic prediction error model that generalizes prior models in the learning and stochastic control communities, incorporates correlation among prediction errors, and captures the fact that predictions improve as time passes. Using this general prediction model, we prove that Averaging Fixed Horizon Control (AFHC) can simultaneously achieve sublinear regret and a constant competitive ratio in expectation using only a constant-sized prediction window, overcoming the hardness results in adversarial prediction models. Second, to understand the optimal use of noisy predictions, we introduce a new class of policies, Committed Horizon Control (CHC), that generalizes both of the popular policies Receding Horizon Control (RHC) and Averaging Fixed Horizon Control (AFHC). Our analysis provides explicit results characterizing the optimal use of prediction in CHC policies as a function of properties of the prediction noise, e.g., variance and correlation structure. Third, we apply the general prediction model and algorithm design framework to the deferrable load control problem in power systems. Our proposed model predictive algorithm provides a significant reduction in the variance of the total load in the power system. Throughout this thesis, we provide both average-case analysis and concentration results for our proposed online algorithms, highlighting that the typical performance is tightly concentrated around the average-case performance.
}, address = {1200 East California Boulevard, Pasadena, California 91125}, advisor = {Wierman, Adam C.}, } @phdthesis{10.7907/K62Y-FV39, author = {Ren, Xiaoqi}, title = {Optimizing Resource Management in Cloud Analytics Services}, school = {California Institute of Technology}, year = {2018}, doi = {10.7907/K62Y-FV39}, url = {https://resolver.caltech.edu/CaltechTHESIS:05312018-080301508}, abstract = {The fundamental challenge in the cloud today is how to build and optimize machine learning and data analytical services. Machine learning and data analytical platforms are changing computing infrastructure from expensive private data centers to easily accessible online services. These services pack user requests as jobs and run them on thousands of machines in parallel in geo-distributed clusters. The scale and the complexity of emerging jobs lead to increasing challenges for the clusters at all levels, from power infrastructure to system architecture and corresponding software framework design.
These challenges come in many forms. Today’s clusters are built on commodity hardware and hardware failures are unavoidable. Resource competition, network congestion, and mixed generations of hardware make the hardware environment complex and hard to model and predict. Such heterogeneity becomes a crucial roadblock for efficient parallelization at both the task level and the job level. Another challenge comes from the increasing complexity of the applications. For example, machine learning services run jobs made up of multiple tasks with complex dependency structures. This complexity leads to difficulties in framework design. The scale, especially when services span geo-distributed clusters, leads to another important hurdle for cluster design. Challenges also come from the power infrastructure. Power infrastructure is very expensive and accounts for more than 20% of the total cost to build a cluster. Optimizing power sharing to maximize facility utilization and smooth peak-hour usage is another challenge in cluster design.
In this thesis, we focus on solutions to these challenges at the task level, at the job level, in the design of geo-distributed data clouds, and in power management for colocation data centers.
At the task level, a crucial hurdle to achieving predictable performance is stragglers, i.e., tasks that take significantly longer than expected to run. Speculative execution has been widely adopted to mitigate the impact of stragglers in simple workloads. We apply straggler mitigation to approximation jobs for the first time. We present GRASS, which carefully uses speculation to mitigate the impact of stragglers in approximation jobs. GRASS’s design is based on the analysis of a model we develop to capture the optimal speculation levels for approximation jobs. Evaluations with production workloads from Facebook and Microsoft Bing in an EC2 cluster of 200 nodes show that GRASS increases the accuracy of deadline-bound jobs by 47% and speeds up error-bound jobs by 38%.
Moving from the task level to the job level, we observe that task-level speculation mechanisms are designed and operated independently of job scheduling when, in fact, scheduling a speculative copy of a task has a direct impact on the resources available for other jobs. Thus, we present Hopper, a job-level speculation-aware scheduler that integrates the tradeoffs associated with speculation into job scheduling decisions, based on a model generalized from the task-level speculation model. We implement both centralized and decentralized prototypes of the Hopper scheduler and show that 50% (66%) improvements over state-of-the-art centralized (decentralized) schedulers and speculation strategies can be achieved through the coordination of scheduling and speculation.
As computing resources move from local clusters to geo-distributed cloud services, we are expecting the same transformation for data storage. We study two crucial pieces of a geo-distributed data cloud system: data acquisition and data placement. Starting from developing the optimal algorithm for the case of a data cloud made up of a single data center, we propose a near-optimal, polynomial-time algorithm for a geo-distributed data cloud in general. We show, via a case study, that the resulting design, Datum, is near-optimal (within 1.6%) in practical settings.
Efficient power management is a fundamental challenge for data centers when providing reliable services. Power oversubscription in data centers is very common and may occasionally trigger an emergency when the aggregate power demand exceeds the capacity. We study power capping solutions for handling such emergencies in a colocation data center, where the operator supplies power to multiple tenants. We propose a novel market mechanism based on supply function bidding, called COOP, to financially incentivize and coordinate tenants’ power reduction for minimizing total performance loss while satisfying multiple power capping constraints. We demonstrate that COOP is “win-win”, increasing the operator’s profit (through oversubscription) and reducing tenants’ costs (through financial compensation for their power reduction during emergencies).
}, address = {1200 East California Boulevard, Pasadena, California 91125}, advisor = {Wierman, Adam C.}, } @mastersthesis{10.7907/Z9SX6B76, author = {London, Palma Alise den Nijs}, title = {Distributed Optimization and Data Market Design}, school = {California Institute of Technology}, year = {2017}, doi = {10.7907/Z9SX6B76}, url = {https://resolver.caltech.edu/CaltechTHESIS:05092017-112643697}, abstract = {We consider algorithms for distributed optimization and their applications. In this thesis, we propose a new approach for distributed optimization based on an emerging area of theoretical computer science – local computation algorithms. The approach is fundamentally different from existing methodologies and provides a number of benefits, such as robustness to link failure and adaptivity to dynamic settings. Specifically, we develop an algorithm, LOCO, that given a convex optimization problem P with n variables and a “sparse” linear constraint matrix with m constraints, provably finds a solution as good as that of the best online algorithm for P using only O(log(n + m)) messages with high probability. The approach is not iterative and communication is restricted to a localized neighborhood. In addition to analytic results, we show numerically that the performance improvements over classical approaches for distributed optimization are significant, e.g., it uses orders of magnitude less communication than ADMM.
We also consider the operations of a geographically distributed cloud data market. We consider design decisions that include which data to purchase (data purchasing) and where to place or replicate the data for delivery (data placement). We show that a joint approach to data purchasing and data placement within a cloud data market improves operating costs. This problem can be viewed as a facility location problem, and is thus NP-hard. However, we give a provably optimal algorithm for the case of a data market consisting of a single data center, and then generalize the result from the single data center setting in order to develop a near-optimal, polynomial-time algorithm for a geo-distributed data market. The resulting design, Datum, decomposes the joint purchasing and placement problem into two subproblems, one for data purchasing and one for data placement, using a transformation of the underlying bandwidth costs. We show, via a case study, that Datum is near-optimal (within 1.6%) in practical settings.
}, address = {1200 East California Boulevard, Pasadena, California 91125}, } @phdthesis{10.7907/Z9BG2KZG, author = {Cai, Wuhan Desmond}, title = {Electricity Markets for the Smart Grid: Networks, Timescales, and Integration with Control}, school = {California Institute of Technology}, year = {2016}, doi = {10.7907/Z9BG2KZG}, url = {https://resolver.caltech.edu/CaltechTHESIS:05262016-112813537}, abstract = {We are at the dawn of a significant transformation in the electric industry. Renewable generation and customer participation in grid operations and markets have been growing at tremendous rates in recent years and these trends are expected to continue. These trends are likely to be accompanied by both engineering and market integration challenges. Therefore, to incorporate these resources efficiently into the grid, it is important to deal with the inefficiencies in existing markets. The goal of this thesis is to contribute new insights towards improving the design of electricity markets.
This thesis makes three main contributions. First, we provide insights into how the economic dispatch mechanism could be designed to account for price-anticipating participants. We study this problem in the context of a networked Cournot competition with a market maker and we give an algorithm to find improved market clearing designs. Our findings illustrate the potential inefficiencies in existing markets and provide a framework for improving the design of the markets. Second, we provide insights into the strategic interactions between generation flexibility and forward markets. Our key insight is that spot market capacity constraints can significantly impact the efficiency and existence of equilibrium in forward markets, as they give producers incentives to strategically withhold offers from the markets. Third, we provide insights into how optimization decomposition theory can guide optimal design of the architecture of power systems control. In particular, we illustrate a context where decomposition theory enables us to jointly design market and control mechanisms to allocate resources efficiently across both the economic dispatch and frequency regulation timescales.
}, address = {1200 East California Boulevard, Pasadena, California 91125}, advisor = {Low, Steven H.}, } @mastersthesis{10.7907/Z9R49NRJ, author = {Ren, Xiaoqi}, title = {Speculation-Aware Resource Allocation for Cluster Schedulers}, school = {California Institute of Technology}, year = {2015}, doi = {10.7907/Z9R49NRJ}, url = {https://resolver.caltech.edu/CaltechTHESIS:09252014-063715278}, abstract = {
}, address = {1200 East California Boulevard, Pasadena, California 91125}, advisor = {Wierman, Adam C.}, } @phdthesis{10.7907/FRGW-AF26, author = {Bose, Subhonmesh}, title = {An Integrated Design Approach to Power Systems: From Power Flows to Electricity Markets}, school = {California Institute of Technology}, year = {2014}, doi = {10.7907/FRGW-AF26}, url = {https://resolver.caltech.edu/CaltechTHESIS:06012014-040224456}, abstract = {The power system is on the brink of change. Engineering needs, economic forces, and environmental factors are the main drivers of this change. The vision is to build a smart electrical grid and a smarter market mechanism around it to fulfill mandates on clean energy. Looking at engineering and economic issues in isolation is no longer an option today; an integrated design approach is needed. In this thesis, I shall revisit some of the classical questions on the engineering operation of power systems that deal with the nonconvexity of power flow equations. Then I shall explore some issues arising from the interaction of these power flow equations with electricity markets, to address the fundamental issue of market power in a deregulated market environment. Finally, motivated by the emergence of new storage technologies, I present an interesting result on the investment decision problem of placing storage over a power network. The goal of this study is to demonstrate that modern optimization and game theory can provide unique insights into this complex system. Some of the ideas carry over to applications beyond power systems.}, address = {1200 East California Boulevard, Pasadena, California 91125}, advisor = {Low, Steven H.}, } @mastersthesis{10.7907/493H-BD56, author = {Chen, Niangjun}, title = {Model Predictive Control for Deferrable Loads Scheduling}, school = {California Institute of Technology}, year = {2014}, doi = {10.7907/493H-BD56}, url = {https://resolver.caltech.edu/CaltechTHESIS:06022014-205438709}, abstract = {Real-time demand response is essential for handling the uncertainties of renewable generation. Traditionally, demand response has been focused on large industrial and commercial loads; however, it is expected that a large number of small residential loads such as air conditioners, dishwashers, and electric vehicles will also participate in the coming years. The electricity consumption of these smaller loads, which we call deferrable loads, can be shifted over time, and thus be used (in aggregate) to compensate for the random fluctuations in renewable generation.
In this thesis, we propose a real-time distributed deferrable load control algorithm to reduce the variance of the aggregate load (load minus renewable generation) by shifting the power consumption of deferrable loads to periods with high renewable generation. The algorithm is model predictive in nature, i.e., at every time step, the algorithm minimizes the expected variance-to-go with updated predictions. We prove that the suboptimality of this model predictive algorithm vanishes as the time horizon expands, in an average-case analysis. Further, we prove strong concentration results on the distribution of the load variance obtained by model predictive deferrable load control. These concentration results highlight that the typical performance of model predictive deferrable load control is tightly concentrated around the average-case performance. Finally, we evaluate the algorithm via trace-based simulations.
}, address = {1200 East California Boulevard, Pasadena, California 91125}, } @phdthesis{10.7907/296T-HR79, author = {Liu, Zhenhua}, title = {Sustainable IT and IT for Sustainability}, school = {California Institute of Technology}, year = {2014}, doi = {10.7907/296T-HR79}, url = {https://resolver.caltech.edu/CaltechTHESIS:05312014-215801543}, abstract = {
Energy and sustainability are among the most critical issues of our generation. While the abundant potential of renewable energy sources such as solar and wind provides a real opportunity for sustainability, their intermittency and uncertainty present a daunting operating challenge. This thesis aims to develop analytical models, deployable algorithms, and real systems to enable efficient integration of renewable energy into complex distributed systems with limited information.
The first thrust of the thesis is to make IT systems more sustainable by facilitating the integration of renewable energy into these systems. IT represents one of the fastest-growing sectors in energy usage and greenhouse gas pollution. Over the last decade there have been dramatic improvements in the energy efficiency of IT systems, but the efficiency improvements do not necessarily lead to a reduction in energy consumption because more servers are demanded. Further, little effort has been put into making IT more sustainable, and most of the improvements are from improved “engineering” rather than improved “algorithms”. In contrast, my work focuses on developing algorithms with rigorous theoretical analysis that improve the sustainability of IT. In particular, this thesis seeks to exploit the flexibilities of cloud workloads both (i) in time by scheduling delay-tolerant workloads and (ii) in space by routing requests to geographically diverse data centers. These opportunities allow data centers to adaptively respond to renewable availability, varying cooling efficiency, and fluctuating energy prices, while still meeting performance requirements. The design of the enabling algorithms is, however, very challenging because of limited information, non-smooth objective functions, and the need for distributed control. Novel distributed algorithms are developed with theoretically provable guarantees to enable “follow the renewables” routing. Moving from theory to practice, I helped HP design and implement the industry’s first Net-zero Energy Data Center.
The second thrust of this thesis is to use IT systems to improve the sustainability and efficiency of our energy infrastructure through data center demand response. The main challenges as we integrate more renewable sources into the existing power grid come from the fluctuation and unpredictability of renewable generation. Although energy storage and reserves can potentially solve these issues, they are very costly. One promising alternative is to make cloud data centers demand-responsive. The potential of such an approach is huge.
To realize this potential, we need adaptive and distributed control of cloud data centers and new electricity market designs for distributed electricity resources. My work is progressing in both directions. In particular, I have designed online algorithms with theoretically guaranteed performance for data center operators to deal with uncertainties under popular demand response programs. Based on the local control rules of customers, I have further designed new pricing schemes for demand response that align the interests of customers, utility companies, and society, thereby improving social welfare.
}, address = {1200 East California Boulevard, Pasadena, California 91125}, advisor = {Wierman, Adam C.}, } @phdthesis{10.7907/NRXJ-JB76, author = {Lin, Minghong}, title = {Algorithmic Challenges in Green Data Centers}, school = {California Institute of Technology}, year = {2013}, doi = {10.7907/NRXJ-JB76}, url = {https://resolver.caltech.edu/CaltechTHESIS:05312013-223354639}, abstract = {With data centers being the supporting infrastructure for a wide range of IT services, their efficiency has become a big concern to operators, as well as to society, for both economic and environmental reasons. The goal of this thesis is to design energy-efficient algorithms that reduce energy cost while minimizing compromise to service. We focus on the algorithmic challenges at different levels of energy optimization across the data center stack. The algorithmic challenge at the device level is to improve the energy efficiency of a single computational device via techniques such as job scheduling and speed scaling. We analyze the common speed scaling algorithms in both the worst-case model and the stochastic model to address some fundamental issues in the design of speed scaling algorithms. The algorithmic challenge at the local data center level is to dynamically allocate resources (e.g., servers) and to dispatch the workload in a data center. We develop an online algorithm to make a data center more power-proportional by dynamically adapting the number of active servers. The algorithmic challenge at the global data center level is to dispatch the workload across multiple data centers, considering the geographical diversity of electricity prices, availability of renewable energy, and network propagation delay. We propose algorithms to jointly optimize routing and provisioning in an online manner. Motivated by the above online decision problems, we move on to study a general class of online problems named “smoothed online convex optimization”, which seeks to minimize the sum of a sequence of convex functions when “smooth” solutions are preferred. This model allows us to bridge different research communities and helps us gain a more fundamental understanding of general online decision problems.}, address = {1200 East California Boulevard, Pasadena, California 91125}, advisor = {Wierman, Adam C.}, } @phdthesis{10.7907/AWE2-H976, author = {Gopalakrishnan, Ragavendran}, title = {Characterizing Distribution Rules for Cost Sharing Games}, school = {California Institute of Technology}, year = {2013}, doi = {10.7907/AWE2-H976}, url = {https://resolver.caltech.edu/CaltechTHESIS:06032013-104204451}, abstract = {In noncooperative cost sharing games, individually strategic agents choose resources based on how the welfare (cost or revenue) generated at each resource (which depends on the set of agents that choose the resource) is distributed. The focus is on finding distribution rules that lead to stable allocations, which is formalized by the concept of Nash equilibrium, e.g., Shapley value (budget-balanced) and marginal contribution (not budget-balanced) rules.
Recent work that seeks to characterize the space of all such rules shows that the only budget-balanced distribution rules that guarantee equilibrium existence in all welfare sharing games are generalized weighted Shapley values (GWSVs), by exhibiting a specific ‘worst-case’ welfare function which requires that GWSV rules be used. Our work provides an exact characterization of the space of distribution rules (not necessarily budget-balanced) for any specific local welfare functions, for a general class of scalable and separable games with well-known applications, e.g., facility location, routing, network formation, and coverage games.
We show that all games conditioned on any fixed local welfare functions possess an equilibrium if and only if the distribution rules are equivalent to GWSV rules on some ‘ground’ welfare functions. Therefore, it is neither the existence of some worst-case welfare function nor the restriction of budget-balance that limits the design to GWSVs. Also, in order to guarantee equilibrium existence, it is necessary to work within the class of potential games, since GWSVs result in (weighted) potential games.
We also provide an alternative characterization: all games conditioned on any fixed local welfare functions possess an equilibrium if and only if the distribution rules are equivalent to generalized weighted marginal contribution (GWMC) rules on some ‘ground’ welfare functions. This result is due to a deeper fundamental connection between Shapley values and marginal contributions that our proofs expose: they are equivalent given a transformation connecting their ground welfare functions. (This connection leads to novel closed-form expressions for the GWSV potential function.) Since GWMCs are more tractable than GWSVs, a designer can trade off budget-balance against computational tractability in deciding which rule to implement.
}, address = {1200 East California Boulevard, Pasadena, California 91125}, advisor = {Wierman, Adam C.}, } @phdthesis{10.7907/GDV2-YF12, author = {Bodine-Baron, Elizabeth Anne}, title = {Peer Effects in Social Networks: Search, Matching Markets, and Epidemics}, school = {California Institute of Technology}, year = {2012}, doi = {10.7907/GDV2-YF12}, url = {https://resolver.caltech.edu/CaltechTHESIS:05222012-145639265}, abstract = {Social network analysis emerged as an important area in sociology in the early 1930s, marking a shift from looking at individual attribute data to examining the relationships between people and groups. Surveying many different types of real-world networks, researchers quickly found that different types of social networks tend to share a common set of structural characteristics, including small diameter, high clustering, and heavy-tailed degree distributions. Moving beyond real networks, in the 1990s researchers began to propose random network models to explain these commonly observed social network structures. These models laid the foundation for investigation into problems where the underlying network plays a key role, from the spread of information and disease, to the design of distributed communication and search algorithms, to mechanism design and public policy. Here we focus on the role of peer effects in social networks. Through this lens, we develop a mathematically tractable random network model incorporating searchability, propose a novel way to model and analyze two-sided matching markets with externalities, model and calculate the cost of an epidemic spreading on a complex network, and examine the impact of conforming and non-conforming peer effects in vaccination decisions on public health policy.
Throughout this work, the goal is to bring together knowledge and techniques from diverse fields like sociology, engineering, and economics, exploiting our understanding of social network structure and generative models to understand deeper problems that — without this knowledge — could be intractable. Instead of crippling our analysis, social network characteristics allow us to reach deeper insights about the interaction between a particular problem and the network underlying it.
}, address = {1200 East California Boulevard, Pasadena, California 91125}, advisor = {Hassibi, Babak}, } @phdthesis{10.7907/AAXJ-EX10, author = {Nair, Jayakrishnan U.}, title = {Scheduling for Heavy-Tailed and Light-Tailed Workloads in Queueing Systems}, school = {California Institute of Technology}, year = {2012}, doi = {10.7907/AAXJ-EX10}, url = {https://resolver.caltech.edu/CaltechTHESIS:06012012-134536732}, abstract = {In much of classical queueing theory, workloads are assumed to be light-tailed, with job sizes being described using exponential or phase type distributions. However, over the past two decades, studies have shown that several real-world workloads exhibit heavy-tailed characteristics. As a result, there has been a strong interest in studying queues with heavy-tailed workloads. So at this stage, there is a large body of literature on queues with light-tailed workloads, and a large body of literature on queues with heavy-tailed workloads. However, heavy-tailed workloads and light-tailed workloads differ considerably in their behavior, and these two types of workloads are rarely studied jointly.
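(To make the distinction concrete, with distributions chosen purely for illustration: a light-tailed exponential job size satisfies $P(X > x) = e^{-\mu x}$, so large jobs become exponentially unlikely, whereas a heavy-tailed Pareto job size satisfies $P(X > x) = (x_m/x)^{\alpha}$ for $x \ge x_m$, so very large jobs retain non-negligible probability and the variance is infinite whenever $\alpha \le 2$.)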
In this thesis, we design scheduling policies for queueing systems, considering both heavy-tailed and light-tailed workloads. The motivation for this line of work is twofold. First, since real-world workloads can be heavy-tailed or light-tailed, it is desirable to design schedulers whose performance is robust to distributional assumptions on the workload. Second, there might be scenarios where a heavy-tailed and a light-tailed workload interact in a queueing system. In such cases, it is desirable to design schedulers that guarantee fairness in resource allocation for both workload types.
In this thesis, we study three models involving the design of scheduling disciplines for both heavy-tailed and light-tailed workloads. In Chapters 3 and 4, we design schedulers that guarantee robust performance across heavy-tailed and light-tailed workloads. In Chapter 5, we consider a setting in which a heavy-tailed and a light-tailed workload compete for service. In this setting, we design scheduling policies that guarantee good response-time tail performance for both workloads, while also maintaining throughput optimality.
}, address = {1200 East California Boulevard, Pasadena, California 91125}, advisor = {Low, Steven H. and Wierman, Adam C.}, } @mastersthesis{10.7907/DBXG-1N54, author = {Lin, Minghong}, title = {Algorithmic Issues in Green Data Centers}, school = {California Institute of Technology}, year = {2011}, doi = {10.7907/DBXG-1N54}, url = {https://resolver.caltech.edu/CaltechTHESIS:10262010-122247121}, abstract = {Power consumption imposes a significant cost on data centers. Thus, it is not surprising that optimizing energy cost in data centers is receiving increasing attention. In this thesis, we focus on the algorithmic issues at three levels of energy optimization for data centers: the server level, the local data center level, and the global data center level. At the server level, we analyze the common speed scaling algorithms in both the worst-case model and the stochastic model to address some fundamental issues in the design of speed scaling algorithms. At the local data center level, we develop an online algorithm that makes a data center more power-proportional by dynamically adapting the number of active servers to match the current workload. At the global data center level, we propose a framework to explore the diversity of power prices and the diversity of propagation delays across geographically distributed data centers.}, address = {1200 East California Boulevard, Pasadena, California 91125}, advisor = {Wierman, Adam C.}, } @mastersthesis{10.7907/QGF7-4K56, author = {Liu, Zhenhua}, title = {Greening Geographical Load Balancing}, school = {California Institute of Technology}, year = {2011}, doi = {10.7907/QGF7-4K56}, url = {https://resolver.caltech.edu/CaltechTHESIS:05252011-221926445}, abstract = {Energy expenditure has become a significant fraction of data center operating costs. Recently, “geographical load balancing” has been suggested to reduce energy cost by exploiting the electricity price differences across regions. However, this reduction of cost can paradoxically increase total energy use. This work explores whether the geographical diversity of internet-scale systems can additionally be used to provide environmental gains.
We first focus on geographical load balancing, which is modeled as a convex optimization problem. We derive two distributed algorithms for achieving optimal geographical load balancing and characterize the optimal solutions.
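A schematic version of this kind of formulation (the notation here is illustrative rather than the thesis’s own): with $\lambda_j$ the request rate at source $j$, $\lambda_{ij}$ the traffic routed from source $j$ to data center $i$, and $m_i$ the number of active servers at data center $i$, one minimizes total energy cost plus delay cost, $\min \sum_i g_i\!\left(m_i, \sum_j \lambda_{ij}\right) + \sum_{i,j} \lambda_{ij} d_{ij}$ over $\lambda_{ij} \ge 0$ and $0 \le m_i \le M_i$, subject to $\sum_i \lambda_{ij} = \lambda_j$ for every source $j$; the problem is jointly convex whenever each cost function $g_i$ is convex.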
We then use this framework and these algorithms to investigate whether geographical load balancing can encourage the use of “green” renewable energy and reduce the use of “brown” fossil fuel energy. Here we consider two approaches, namely, dynamic pricing and local renewables.
For the dynamic pricing case, our numerical results show that if electricity is dynamically priced in proportion to the instantaneous fraction of the total energy that is brown, then geographical load balancing provides significant reductions in brown energy use. However, the benefits depend strongly on the degree to which systems accept dynamic energy pricing and on the form of pricing used.
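One simple way to write such a pricing rule (an illustrative form with hypothetical notation, not necessarily the one evaluated in the thesis): if $b_i(t)$ and $r_i(t)$ denote the brown and renewable energy available at location $i$ at time $t$, the instantaneous price can be set proportional to the brown fraction, $p_i(t) \propto b_i(t) / \bigl(b_i(t) + r_i(t)\bigr)$, so that routing load toward low-price locations amounts to “following the renewables.”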
For the local renewable case, we perform a trace-based study to evaluate three issues related to achieving this goal: the impact of geographical load balancing, the role of storage, and the optimal mix of renewables. Our results highlight that geographical load balancing can significantly reduce the required capacity of renewable energy by using the energy more efficiently with “follow the renewables” routing. Further, our results show that small-scale storage can be useful, especially in combination with geographical load balancing, and that an optimal mix of renewables includes significantly more wind than photovoltaic solar.
}, address = {1200 East California Boulevard, Pasadena, California 91125}, } @mastersthesis{10.7907/ZW5K-AF41, author = {Gopalakrishnan, Ragavendran}, title = {An Architectural View of Game Theoretic Control}, school = {California Institute of Technology}, year = {2010}, doi = {10.7907/ZW5K-AF41}, url = {https://resolver.caltech.edu/CaltechTHESIS:05272010-163702257}, abstract = {Resource allocation has long been a fundamental research problem across several disciplines. While traditional approaches to this problem were centralized, recent research has focussed on distributed solutions for resource allocation, for reasons of scalability, reliability and efficiency in many real-world applications. Game-theoretic control is a promising new approach for distributed resource allocation. In this thesis, we describe how game-theoretic control can be viewed as having an intrinsic layered architecture, which provides a modularization that simplifies the control design. We illustrate this architectural view by presenting details about one particular instantiation using potential games as an interface. This example serves to highlight the strengths and limitations of the proposed architecture while also illustrating the relationship between game-theoretic control and other existing approaches to distributed resource allocation. We also demonstrate the power of this approach by reformulating the power control problem in sensor networks as a game-theoretic control problem in the potential games instantiation of our framework. This allows us to relax several assumptions made by previous contributions, and consider more complex objective functions.}, address = {1200 East California Boulevard, Pasadena, California 91125}, advisor = {Wierman, Adam C.}, }