@comment{
  Caltech thesis bibliography (advisor: Stephen Taylor).
  Conventions: lowercase entry types; DOI-based citation keys; bare DOIs
  (no resolver prefix); one field per line; `advisor` is a nonstandard
  field ignored by standard styles and kept as an annotation.
}

@phdthesis{10.7907/gvgq-3d11,
  author   = {Watts, Jerrell R.},
  title    = {Dynamic Load Balancing and Granularity Control on Heterogeneous and Hybrid Architectures},
  school   = {California Institute of Technology},
  year     = {1998},
  doi      = {10.7907/gvgq-3d11},
  url      = {https://resolver.caltech.edu/CaltechETD:etd-02072008-075916},
  address  = {1200 East California Boulevard, Pasadena, California 91125},
  advisor  = {Taylor, Stephen},
  abstract = {The past several years have seen concurrent applications grow increasingly complex, as the most advanced techniques from academia find their way into production parallel applications. Moreover, the platforms on which these concurrent computations now execute are frequently heterogeneous networks of workstations and shared-memory multiprocessors, because of their low cost relative to traditional large-scale multicomputers. The combination of sophisticated algorithms and more complex computing environments has made existing load balancing techniques obsolete. Current methods characterize the loads of tasks in very simple terms, often fail to account for the communication costs of an application, and typically consider computational resources to be homogeneous. The complexity of current applications coupled with the fact that they are running in heterogeneous environments has also made partitioning a problem for concurrent execution an ordeal. It is no longer adequate to simply divide the problem into some number of pieces per computer and hope for the best. In a complex application, the workloads of the pieces, which may be equal initially, may diverge over time. On a heterogeneous network, the varying capabilities of the computers will widen this disparity in resource usage even further. Thus, there is a need to dynamically manage the granularity of an application, repartitioning the problem at runtime to correct inadequacies in the original partitioning and to make more effective use of computational resources.
This thesis presents techniques for dynamic load balancing in complex irregular applications. Advances over previous work are three-fold: First, these techniques are applicable to networks comprised of heterogeneous machines, including both single-processor workstations and personal computers, and multiprocessor compute servers. Second, the use of load vectors more accurately characterizes the resource requirements of tasks, including the computational demands of different algorithmic phases as well as the needs for other resources, such as memory. Finally, runtime repartitioning adjusts the granularity of the problem so that the available resources are more fully utilized. Two other improvements over earlier techniques include improved algorithms for determining the ideal redistribution of work as well as advanced techniques for selecting which tasks to transfer to satisfy those ideals. The latter algorithms incorporate the notion of task migration costs, including the impact on an application's communications locality. The improvements listed above are demonstrated on both industrial applications and small parametric problems on networks of heterogeneous computers as well as traditional large-scale multicomputers.},
}

@phdthesis{10.7907/sx57-5d89,
  author   = {Rieffel, Marc A.},
  title    = {Performance Modeling for Concurrent Particle Simulations},
  school   = {California Institute of Technology},
  year     = {1998},
  doi      = {10.7907/sx57-5d89},
  url      = {https://resolver.caltech.edu/CaltechETD:etd-01242008-132610},
  address  = {1200 East California Boulevard, Pasadena, California 91125},
  advisor  = {Taylor, Stephen},
  abstract = {This thesis develops an application- and architecture-independent framework for predicting the runtime and memory requirements of particle simulations in complex three-dimensional geometries. Both sequential and concurrent simulations are addressed, on a variety of homogeneous and heterogeneous architectures. The models are considered in the context of neutral flow Direct Simulation Monte Carlo (DSMC) simulations for semiconductor manufacturing and aerospace applications.
Complex physical and chemical processes render algorithmic analysis alone insufficient for understanding the performance characteristics of particle simulations. For this reason, detailed knowledge of the interaction between the physics and chemistry of a problem and the numerical method used to solve it is required.
Prediction of runtime and storage requirements of sequential and concurrent particle simulations is possible with the use of these models. The feasibility of simulations for given physical systems can also be determined. While the present work focuses on the concurrent DSMC method, the same modeling techniques can be applied to other numerical methods, such as Particle-In-Cell (PIC) and Navier-Stokes (NS).},
}

@phdthesis{10.7907/mbmg-ne39,
  author   = {Palmer, Michael Edward},
  title    = {Exploiting Parallel Memory Hierarchies for Ray Casting Volumes},
  school   = {California Institute of Technology},
  year     = {1997},
  doi      = {10.7907/mbmg-ne39},
  url      = {https://resolver.caltech.edu/CaltechETD:etd-01162008-080520},
  address  = {1200 East California Boulevard, Pasadena, California 91125},
  advisor  = {Taylor, Stephen},
  abstract = {Previous work in single-processor ray casting methods for volume rendering has concentrated on algorithmic optimizations to reduce computational work. This approach leaves untapped the performance gains which are possible through efficient exploitation of the memory hierarchy.
Previous work in parallel volume rendering has concentrated on parallel partitioning, with the goals of maximizing load balance and minimizing communication between distributed nodes. This implies a simplified view of the memory hierarchy of a parallel machine, ignoring the relationship between parallel partitioning and memory hierarchy effects at all but the top level.
In this thesis, we progressively develop methods to optimize memory hierarchy performance for ray casting: 1) on a uniprocessor, using algorithmic modifications to isolate cache miss costs, specialized hardware to monitor cache misses on the bus, and a software cache simulator; 2) on a shared-memory Power Challenge multiprocessor, examining the fundamental dependence of algorithmic design decisions regarding parallel partitioning upon memory hierarchy effects at several levels; and 3) on a distributed array of interconnected Power Challenge multiprocessors, on which we implement a logical global address space for volume blocks, and investigate the tradeoff between replication (caching) and communication of data.
The methods we develop permit us to exploit the coherence found in volume rendering to increase memory locality, and thereby increase memory system performance. This focus on the optimal exploitation of the entire memory hierarchy, from the processor cache, to the interconnection network between distributed nodes, yields faster frame rates for large (357 MB to 1 GB) datasets than have been previously cited in the literature, and allows us to efficiently render a 7.1 GB dataset, the largest ever rendered.
Our results have implications for the parallel solution of other problems which, like ray casting, require a global gather operation, use an associative operator to combine partial results, and contain coherence. We discuss implications for the design of a parallel architecture suited to solving this class of problems, specifically, that these algorithms are best served by a deep memory hierarchy.},
}

@phdthesis{10.7907/tyap-ea69,
  author   = {Maskit, Daniel},
  title    = {Software Register Synchronization for Super-Scalar Processors with Partitioned Register Files},
  school   = {California Institute of Technology},
  year     = {1997},
  doi      = {10.7907/tyap-ea69},
  url      = {https://resolver.caltech.edu/CaltechETD:etd-01102008-153402},
  address  = {1200 East California Boulevard, Pasadena, California 91125},
  advisor  = {Taylor, Stephen},
  abstract = {Increases in high-end microprocessor performance are becoming increasingly reliant on simultaneous issuing of instructions to multiple functional units on a single chip. As the number of functional units increases, the chip area, wire lengths, and delays required for a monolithic register file become unreasonable. Future microprocessors will have partitioned register files. The correctness of contemporary super-scalar processors relies on synchronized accesses to registers. This issue will be critical in systems with partitioned register files. Current techniques for managing register access ordering, such as register score boarding and register renaming, are inadequate for architectures with partitioned register files. This thesis demonstrates the difficulties of implementing these techniques with a partitioned register file, and introduces a novel compiler algorithm which addresses this issue.
Whenever a processor using register scoreboarding or register renaming issues an instruction, either the scoreboard or the register name table must be accessed to check the instruction's sources and destination. If the register file is partitioned, checking the scoreboard or name table for a remote register is difficult. One functional unit cannot determine at runtime when it is safe to write to a register in another functional unit's register file. While these techniques can be supported through use of a global or partitioned scoreboard, such an implementation would be complex, and have latency problems similar to those of a monolithic register file.
This work discusses the organization of multiple functional units into loosely-coupled groups of functional units that can communicate via direct register writes, but with purely local hardware interlocks to force synchronization. A novel compiler algorithm, Software Register Synchronization (SRS), is introduced. A comparison between SRS and existing hardware mechanisms is conducted using the Multiflow compiler modified to generate code for the MIT M-Machine. Experiments to evaluate the SRS algorithm are run on the M-Machine simulator being used for architectural verification. In order to support partitioned register file architectures, an alternative to traditional hardware methods for managing register synchronization needs to be developed. This thesis presents a novel compiler algorithm to address this need. The SRS algorithm is described, demonstrated to be correct, and evaluated. Details of the implementation of the SRS algorithm within the Multiflow compiler for the MIT M-Machine are provided.},
}

@mastersthesis{10.7907/jyz7-7019,
  author   = {Watts, Jerrell},
  title    = {A Practical Approach to Dynamic Load Balancing},
  school   = {California Institute of Technology},
  year     = {1995},
  doi      = {10.7907/jyz7-7019},
  url      = {https://resolver.caltech.edu/CaltechTHESIS:04122012-160604635},
  address  = {1200 East California Boulevard, Pasadena, California 91125},
  advisor  = {Taylor, Stephen},
  abstract = {No abstract.},
}

@mastersthesis{10.7907/z35v-1x17,
  author   = {Maskit, Daniel},
  title    = {A Message-Driven Programming System for Fine-Grain Multicomputers},
  school   = {California Institute of Technology},
  year     = {1994},
  doi      = {10.7907/z35v-1x17},
  url      = {https://resolver.caltech.edu/CaltechTHESIS:04122012-095225392},
  address  = {1200 East California Boulevard, Pasadena, California 91125},
  advisor  = {Taylor, Stephen},
  abstract = {No abstract.},
}