[
{
"id": "authors:ekw3n-6et55",
"collection": "authors",
"collection_id": "ekw3n-6et55",
"cite_using_url": "https://resolver.caltech.edu/CaltechAUTHORS:20120420-102640427",
"type": "monograph",
"title": "A mathematical approach to modelling the flow of data and control in computational networks",
"author": [
{
"family_name": "Johnsson",
"given_name": "Lennart",
"clpid": "Johnsson-L"
},
{
"family_name": "Cohen",
"given_name": "Danny",
"clpid": "Cohen-Danny"
}
],
"abstract": "This paper proposes a mathematical formalism for the synthesis and qualitative analysis of computational networks that treats data and control in the same manner. Expressions in this notation are given a direct interpretation in the implementation domain. Topology,\nbroadcasting, pipelining, and similar properties of implementations can be determined directly from the expressions.\n\nThis treatment of computational networks emphasizes the space/time tradeoff of implementations. A full instantiation in space of most computational problems is unrealistic, even in VLSI (Finnegan [4]). Therefore, computations also have to be at least partially\ninstantiated in the time domain, requiring the use of explicit control mechanisms, which typically cause the data flow to be nonstationary and sometimes turbulent.",
"doi": "10.7907/ekw3n-6et55",
"publisher": "California Institute of Technology",
"publication_date": "2012-05-02"
},
{
"id": "authors:bmm7d-81x26",
"collection": "authors",
"collection_id": "bmm7d-81x26",
"cite_using_url": "https://resolver.caltech.edu/CaltechAUTHORS:20120418-110634950",
"type": "monograph",
"title": "VLSI Architecture and Design",
"author": [
{
"family_name": "Johnsson",
"given_name": "Lennart",
"clpid": "Johnsson-L"
}
],
"abstract": "Integrated circuit technology is rapidly approaching a state where feature sizes of one micron or less are tractable. Chip sizes are increasing slowly. These two developments result in considerably increased complexity in chip design. The physical characteristics of integrated circuit technology are also changing. The cost of communication will be dominating making new architectures and algorithms both feasible and desirable. A large\nnumber of processors on a single chip will be possible. The cost of communication will make\ndesigns enforcing locality superior to other types of designs.\n\nScaling down feature sizes results in increase of the delay that wires introduce. The delay even of metal wires will become significant. Time tends to be a local property which will make the design of globally synchronous systems more difficult. Self-timed systems will eventually become a necessity. \n\nWith the chip complexity measured in terms of logic devices increasing by more than an order of magnitude over the next few years the importance of efficient design methodologies and tools become crucial. Hierarchical and structured design are ways of dealing with the complexity of chip design. Structered design focuses on the information\nflow and enforces a high degree of regularity. Both hierarchical and structured design encourage the use of cell libraries. The geometry of the cells in such libraries should be parameterized so that for instance cells can adjust there size to neighboring cells and make the proper interconnection. Cells with this quality can be used as a basis for \"Silicon Compilers\".",
"doi": "10.7907/bmm7d-81x26",
"publisher": "California Institute of Technology",
"publication_date": "2012-04-18"
},
{
"id": "authors:70tmd-29e82",
"collection": "authors",
"collection_id": "70tmd-29e82",
"cite_using_url": "https://resolver.caltech.edu/CaltechCSTR:1981.4287-tr-81",
"type": "monograph",
"title": "Computational Arrays for Band Matrix Equations",
"author": [
{
"family_name": "Johnsson",
"given_name": "Lennart",
"clpid": "Johnsson-L"
}
],
"abstract": "No Abstract.",
"doi": "10.7907/70tmd-29e82",
"publisher": "California Institute of Technology",
"publication_date": "2002-11-27"
},
{
"id": "authors:f5pmx-pnx37",
"collection": "authors",
"collection_id": "f5pmx-pnx37",
"cite_using_url": "https://resolver.caltech.edu/CaltechCSTR:1980.4087-tr-80",
"type": "monograph",
"title": "Gaussian Elimination on Sparse Matricies and Concurrency",
"author": [
{
"family_name": "Johnsson",
"given_name": "Lennart",
"clpid": "Johnsson-L"
}
],
"abstract": "No Abstract.",
"doi": "10.7907/f5pmx-pnx37",
"publisher": "California Institute of Technology",
"publication_date": "2002-08-29"
},
{
"id": "authors:77gav-sns10",
"collection": "authors",
"collection_id": "77gav-sns10",
"cite_using_url": "https://resolver.caltech.edu/CaltechCSTR:1983.5092-tr-83",
"type": "monograph",
"title": "Residue Arithmetic and VLSI",
"author": [
{
"family_name": "Chiang",
"given_name": "Chao-Lin",
"clpid": "Chiang-Chao-Lin"
},
{
"family_name": "Johnsson",
"given_name": "Lennart",
"clpid": "Johnsson-L"
}
],
"abstract": "In the residue number system arithmetic is carried\nout on each digit individually. There is no carry chain.\nThis locality is of particular interest in VLSI. An \nevaluation of different implementations of residue arithmetic is carried out, and the effects of reduced feature sizes estimated. At the current state of technology the traditional table lookup method is preferable for a range that requires a maximum modulus that is represented by up to 4 bits, while an array of adders offers the best performance fur 7 or more bits. A combination of adders and\ntables covers 5 and 6 bits the best. At 0.5 mu m feature\nsize table lookup is competitive only up to 3 bits, These\nconclusions are based on sample designs in nMOS.",
"doi": "10.7907/77gav-sns10",
"publisher": "California Institute of Technology",
"publication_date": "2002-08-07"
},
{
"id": "authors:gz4dm-3tg53",
"collection": "authors",
"collection_id": "gz4dm-3tg53",
"cite_using_url": "https://resolver.caltech.edu/CaltechCSTR:1983.5084-tr-83",
"type": "monograph",
"title": "The Tree Machine: An Evaluation of Strategies For Reducing Program Loading Time",
"author": [
{
"family_name": "Li",
"given_name": "Pey-yun Peggy",
"clpid": "Li-Pey-yun"
},
{
"family_name": "Johnsson",
"given_name": "Lennart",
"clpid": "Johnsson-L"
}
],
"abstract": "The Caltech Tree Machine has an ensemble architecture, Processors\nare interconnected into a binary tree. Each node executes its own code.\nNo two nodes need to execute identical code. Nodes are synchronized by\nmessages between adjacent nodes. Since the number of nodes is intended\nto be large, in the order of thousands, great care needs to be exercised\nin devising loading strategies to make the loading time as short as\npossible. A constraint is also imposed by the very limited storage\nassociated with a processor.\n\nNodes are assigned a type that identifies the code it shall execute.\nNodes of the same type execute identical code. Tree Machine programs\nare frequently very regular. By exploiting this regularity, compact\ndescriptions of the types of all nodes in the tree can be created. The\nlimited storage of a node, and the desire to only use local information\nin the expansion of the compacted description implies constraints on the\ncompression/decompression algorithms.\n\nA loading time proportional to the height of the tree is attainable\nin many cases with the algorithms presented. This time is also the\nworst case performance for one of the algorithms. The other algorithms\nhave a worst case performance of 0 square root of N/f and O square root of (N to the power of 1/log2f), where N is the total number of nodes in a tree with fanout f. The algorithms with a\nless favorable upper bound, in some cases allow a more compact tree\ndescription, than the algorithm with the best upper bound.",
"doi": "10.7907/gz4dm-3tg53",
"publisher": "California Institute of Technology",
"publication_date": "1983-01-01"
},
{
"id": "authors:64hjx-fv005",
"collection": "authors",
"collection_id": "64hjx-fv005",
"cite_using_url": "https://resolver.caltech.edu/CaltechCSTR:1983.5079-tr-83",
"type": "monograph",
"title": "Highly Concurrent Algorithms for Solving Linear Systems of Equations",
"author": [
{
"family_name": "Johnsson",
"given_name": "Lennart",
"clpid": "Johnsson-L"
}
],
"abstract": "No Abstract.",
"doi": "10.7907/64hjx-fv005",
"publisher": "California Institute of Technology",
"publication_date": "1983-01-01"
},
{
"id": "authors:4e69j-smn59",
"collection": "authors",
"collection_id": "4e69j-smn59",
"cite_using_url": "https://resolver.caltech.edu/CaltechAUTHORS:20120420-155106097",
"type": "monograph",
"title": "A Formal Derivation of Array Implementations of FFT Algorithms",
"author": [
{
"family_name": "Johnsson",
"given_name": "Lennart",
"clpid": "Johnsson-L"
},
{
"family_name": "Cohen",
"given_name": "Danny",
"clpid": "Cohen-Danny"
}
],
"abstract": "Fast Fourier Transform, FFT, algorithms are interesting for direct hardware implementation in VLSI. The description of FFT algorithms is typically made either in terms of graphs illustrating the dependency between different data elements or in terms of mathematical expressions without any notion of how the computations are implemented in space or\ntime. Expressions in the notation used in this paper can be given an interpretation in the implementation domain. The notation is in this paper used to derive a description of array implementations of decimation-in-frequency and decimation-in-time FFT algorithms. Correctness of the implementations is guaranteed by way of derivation.",
"doi": "10.7907/4e69j-smn59",
"publisher": "California Institute of Technology",
"publication_date": "1982-11"
},
{
"id": "authors:vwwfw-anp96",
"collection": "authors",
"collection_id": "vwwfw-anp96",
"cite_using_url": "https://resolver.caltech.edu/CaltechCSTR:1982.5052-tr-82",
"type": "monograph",
"title": "Submicron Systems Architecture: Semiannual Technical Report",
"author": [
{
"family_name": "Johnsson",
"given_name": "Lennart",
"clpid": "Johnsson-L"
},
{
"family_name": "Seitz",
"given_name": "Charles L.",
"clpid": "Seitz-C-L"
}
],
"abstract": "No Abstract.",
"doi": "10.7907/vwwfw-anp96",
"publisher": "California Institute of Technology",
"publication_date": "1982-01-01"
},
{
"id": "authors:6a5xt-r6216",
"collection": "authors",
"collection_id": "6a5xt-r6216",
"cite_using_url": "https://resolver.caltech.edu/CaltechCSTR:1982.5040-tr-82",
"type": "monograph",
"title": "Concurrent Algorithms for the Conjugate Gradient Method",
"author": [
{
"family_name": "Johnsson",
"given_name": "Lennart",
"clpid": "Johnsson-L"
}
],
"abstract": "A few concurrent algorithms for the basic conjugate gradient method\nis devised and discussed. Most of the algorithms have a topology that\nis naturally determined by characteristic dimensions of the system and\nthe operations of each step of the conjugate gradient method. The\ntopologies map well onto buildable structures of sparsely interconnected\nprocessors while preserving unit communication distance. The topology\nof the algorithms are:\n\n1) A binary tree\n\n2) A composition of a binary tree and a ring the nodes of\nwhich forms the leaves of the tree.\n\n3 ) A linear array with some additional processing elements.\nIt is also discussed how these algorithms maps onto Boolean n-cubes.\nThe algorithms all have the property that a communication operation\nis associated with each computation.\n\n\nNo claim is made as to the optimality from a space-time complexity\npoint of the algorithms presented here. However, the processor\nutilization for some algorithms and topologies are close to 100% and the\nspace*time complexity of those algorithms are of the same order as the\narithmetic complexity of common sequential machine algorithms.",
"doi": "10.7907/6a5xt-r6216",
"publisher": "California Institute of Technology",
"publication_date": "1982-01-01"
},
{
"id": "authors:en871-srv73",
"collection": "authors",
"collection_id": "en871-srv73",
"cite_using_url": "https://resolver.caltech.edu/CaltechAUTHORS:20120419-115610622",
"type": "monograph",
"title": "Pipelined linear equation solvers and VLSI",
"author": [
{
"family_name": "Johnsson",
"given_name": "Lennart",
"clpid": "Johnsson-L"
}
],
"abstract": "Many of the commonly used methods for solution of linear systems of equations on sequential machines can be given a concurrent formulation. The concurrent algorithms take advantage of independence of operations in order to reduce the time complexity of the methods. During the course of computations specified by the algorithm data has to be routed to the various places of computation. Pipelining\ncan be used to avoid broadcasting in VLSI arrays for computation. Pipelining will in general allow for a reduced cycle time but may force data to be spread out in\ntime, as is the case for Gaussian elimination. What the required spacing is depends on the pipelining and the data flow.\n\nIn the paper concurrent algorithms and their pipelining for Gaussian elimination, Householder transformations and Given's rotations are discussed, Gaussian elimination and Given's rotations can use two dimensional arrays while Householder transformation uses a one dimensional array. If partial pivoting is necessary in Gaussian elimination, then one dimension of the array is essentially lost and s\nlinear array is almost as efficient as a two-dimensional array. Householder transformations that are numerically stable may perform the triangulation in shorter time, if partial pivoting is necessary in Gaussian elimination. The amount of arithmetic that a node in the arrays perform is somewhat different for the different methods. The difference is largest for the boundary cells. However, it\nshould be feasible to design a common node of very low complexity that very efficiently supports a range of methods for the solution of linear systems of\nequations.",
"doi": "10.7907/en871-srv73",
"publisher": "California Institute of Technology",
"publication_date": "1982"
},
{
"id": "authors:madaw-z5041",
"collection": "authors",
"collection_id": "madaw-z5041",
"cite_using_url": "https://resolver.caltech.edu/CaltechAUTHORS:20120423-165211870",
"type": "monograph",
"title": "A Computational Array for the QR-Method",
"author": [
{
"family_name": "Johnsson",
"given_name": "Lennart",
"clpid": "Johnsson-L"
}
],
"abstract": "The QR-method is a method for the solution of linear system of equations. The matrix R is upper triangular and Q is a unitary matrix. In equation solving Q is not always computed explicitly. The matrix R can be obtained by applying a sequence of unitary transformations to the matrix defining the system of equations. Householder's method or Given's method can be used to determine\nunitary transformation matrices. This paper describes a concurrent algorithm and corresponding array for computing the triangular matrix R by Householder transformations. Particular attention is given to issues such as broadcasting\nand pipelining.",
"doi": "10.7907/madaw-z5041",
"publisher": "California Institute of Technology",
"publication_date": "1982"
},
{
"id": "authors:4aq2m-bnw32",
"collection": "authors",
"collection_id": "4aq2m-bnw32",
"cite_using_url": "https://resolver.caltech.edu/CaltechAUTHORS:20120420-105611583",
"type": "monograph",
"title": "VLSI algorithms for Doolittle's, Crout's, and Cholesky's methods",
"author": [
{
"family_name": "Johnsson",
"given_name": "Lennart",
"clpid": "Johnsson-L"
}
],
"abstract": "In order to take full advantage of the emerging\nVLSI technology it is required to recognize its\nlimited communication capability and structure\nalgorithms accordingly. In this paper concurrent\nalgorithms for the methods of Crout, Doolittle and\nCholesky are described and compared with\nconcurrent algorithms for Gauss' , Given's and\nHouseholder's method. The effect of pipe lining the\ncomputations in two dimensional arrays is given\nspecial attention.",
"doi": "10.7907/4aq2m-bnw32",
"publisher": "California Institute of Technology",
"publication_date": "1982"
},
{
"id": "authors:gn21a-t6x26",
"collection": "authors",
"collection_id": "gn21a-t6x26",
"cite_using_url": "https://resolver.caltech.edu/CaltechCSTR:1981.4191-tr-81",
"type": "monograph",
"title": "Towards a Formal Treatment of VLSI Arrays",
"author": [
{
"family_name": "Johnsson",
"given_name": "Lennart",
"clpid": "Johnsson-L"
},
{
"family_name": "Weiser",
"given_name": "Uri",
"clpid": "Weiser-U"
},
{
"family_name": "Cohen",
"given_name": "Danny",
"clpid": "Cohen-Danny"
},
{
"family_name": "Davis",
"given_name": "Alan L.",
"clpid": "Davis-A-L"
}
],
"abstract": "This paper presents a formalism for describing the behavior of computational networks at the\nalgorithmic level. It establishes a direct correspondence between the mathematical expressions\ndefining a function and the computational networks which compute that function. By formally\nmanipulating the symbolic expressions that define a function, it is possible to obtain different\nnetworks that compute the function. From this mathematical description of a network, one can\ndirectly determine certain important characteristics of computational networks, such as\ncomputational rate, performance and communication requirements. The use of this formalism for\ndesign and verification is demonstrated on computational networks for Finite Impulse Response (FIR)\nfilters, matrix operations, and the Discrete Fourier Transform (DFT).\nThe progression of computations can often be modeled by wave fronts in an illuminating way. The\nformalism supports this model. A computational network can be viewed in an abstract form that can\nbe represented as a graph. The duality between the graph representation and the mathematical\nexpressions is briefly introduced.",
"doi": "10.7907/gn21a-t6x26",
"publisher": "California Institute of Technology",
"publication_date": "1981-01-01"
},
{
"id": "authors:n91j8-85m21",
"collection": "authors",
"collection_id": "n91j8-85m21",
"cite_using_url": "https://resolver.caltech.edu/CaltechCSTR:1981.4168-tr-81",
"type": "monograph",
"title": "Computational Arrays for the Discrete Fourier Transform",
"author": [
{
"family_name": "Johnsson",
"given_name": "Lennart",
"clpid": "Johnsson-L"
},
{
"family_name": "Cohen",
"given_name": "Danny",
"clpid": "Cohen-Danny"
}
],
"abstract": "A mathematical approach towards the development of computational arrays for\nthe Discrete Fourier Transform (DFT) is pursued in this paper. Mathematical expressions\nfor the DFT are given a direct hardware interpretation. Different implementations are\ndeveloped by formal manipulation of the equations defining the DFT. Properties of the\nimplementations can be told directly from the corresponding equations. Special \nconsideration is given to the performance of implementations and corresponding hardware\nrequirements. The standard equations defining the DFT on N values corresponds if the\nequations are given a direct hardware interpretation to an Implementation requiring N\nto the power of 2 modules. By formal manipulation of the equations defining the DFT we develop\nimplementations requiring N and Log subscript2N modules respectively.",
"doi": "10.7907/n91j8-85m21",
"publisher": "California Institute of Technology",
"publication_date": "1981-01-01"
}
]