@phdthesis{10.7907/ntem-sn47, author = {Chen, Kuan-Chang}, title = {Energy-Efficient Receiver Design for High-Speed Interconnects}, school = {California Institute of Technology}, year = {2022}, doi = {10.7907/ntem-sn47}, url = {https://resolver.caltech.edu/CaltechTHESIS:08042021-231915829}, abstract = {

High-speed interconnects are of vital importance to the operation of high-performance computing and communication systems, determining the ultimate bandwidth or data rates at which the information can be exchanged. Optical interconnects and the employment of high-order modulation formats are considered as the solutions to fulfilling the envisioned speed and power efficiency of future interconnects. One common key factor in bringing the success is the availability of energy-efficient receivers with superior sensitivity. To enhance the receiver sensitivity, improvement in the signal-to-noise ratio (SNR) of the front-end circuits, or equalization that mitigates the detrimental inter-symbol interference (ISI) is required. In this dissertation, architectural and circuit-level energy-efficient techniques serving these goals are presented.

First, an avalanche photodetector (APD)-based optical receiver is described, which utilizes non-return-to-zero (NRZ) modulation and is applicable to burst-mode operation. For the purposes of improving the overall optical link energy efficiency as well as the link bandwidth, this optical receiver is designed to achieve high sensitivity and high reconfiguration speed. The high sensitivity is enabled by optimizing the SNR at the front-end through adjusting the APD responsivity via its reverse bias voltage, along with the incorporation of 2-tap feedforward equalization (FFE) and 2-tap decision feedback equalization (DFE) implemented in current-integrating fashion. The high reconfiguration speed is empowered by the proposed integrating dc and amplitude comparators, which eliminate the RC settling time constraints. The receiver circuits, excluding the APD die, are fabricated in 28-nm CMOS technology. The optical receiver achieves bit-error-rate (BER) better than 1E−12 at −16-dBm optical modulation amplitude (OMA), 2.24-ns reconfiguration time with 5-dB dynamic range, and 1.37-pJ/b energy efficiency at 25 Gb/s.

Second, a 4-level pulse amplitude modulation (PAM4) wireline receiver is described, which incorporates continuous time linear equalizers (CTLEs) and a 2-tap direct DFE dedicated to the compensation for the first and second post-cursor ISI. The direct DFE in a PAM4 receiver (PAM4-DFE) is made possible by the proposed CMOS track-and-regenerate slicer. This proposed slicer offers rail-to-rail digital feedback signals with significantly improved clock-to-Q delay performance. The reduced slicer delay relaxes the settling time constraint of the summer circuits and allows the stringent DFE timing constraint to be satisfied. With the availability of a direct DFE employing the proposed slicer, inductor-based bandwidth enhancement and loop-unrolling techniques, which can be power/area intensive, are not required. Fabricated in 28-nm CMOS technology, the PAM4 receiver achieves BER better than 1E−12 and 1.1-pJ/b energy efficiency at 60 Gb/s, measured over a channel with 8.2-dB loss at Nyquist frequency.

Third, digital neural-network-enhanced FFEs (NN-FFEs) for PAM4 analog-to-digital converter (ADC)-based optical interconnects are described. The proposed NN-FFEs employ a custom learnable piecewise linear (PWL) activation function to tackle the nonlinearities with short memory lengths. In contrast to the conventional Volterra equalizers where multipliers are utilized to generate the nonlinear terms, the proposed NN-FFEs leverage the custom PWL activation function for nonlinear operations and reduce the required number of multipliers, thereby improving the area and power efficiencies. Applications in the optical interconnects based on micro-ring modulators (MRMs) are demonstrated with simulation results of 50-Gb/s and 100-Gb/s links adopting PAM4 signaling. The proposed NN-FFEs and the conventional Volterra equalizers are synthesized with the standard-cell libraries in a commercial 28-nm CMOS technology, and their power consumptions and performance are compared. Better than 37% lower power overhead can be achieved by employing the proposed NN-FFEs, in comparison with the Volterra equalizer that leads to similar improvement in the symbol-error-rate (SER) performance.

}, address = {1200 East California Boulevard, Pasadena, California 91125}, advisor = {Emami, Azita}, } @phdthesis{10.7907/Z9P55KJ7, author = {Monge Osorio, Manuel Alejandro}, title = {Localization and Stimulation Techniques for Implantable Medical Electronics}, school = {California Institute of Technology}, year = {2017}, doi = {10.7907/Z9P55KJ7}, url = {https://resolver.caltech.edu/CaltechTHESIS:05312017-143935777}, abstract = {

Implantable medical devices (IMDs) are emerging as one of the keystones of tomorrow’s medical technology. Although they have enabled a revolution in medicine, from research to diagnosis to treatment, most of today’s devices have critical limitations. They are bulky, have low resolution, and, in some cases, are limited to basic functionality. Miniaturization of IMDs will have an enormous impact not only on the technology itself and the medical procedures they enable, but also on the lives of patients, who will be more comfortable, have greater confidence in their medical treatments, and enjoy an overall improvement in their quality of life. The path towards miniaturized bioelectronic devices requires a reevaluation of existing paradigms to reach a seamless integration of electronics and biology. Miniaturization of medical electronics then involves an exploration of advanced integrated circuit processes and novel circuit and system level architectures. In this dissertation, we provide an overview of implantable medical devices and present novel circuit and system level techniques for the miniaturization of medical electronics.

The function of wireless miniaturized medical devices such as capsule endoscopes, biosensors, and drug delivery systems depends critically on their location inside the body. However, existing electromagnetic, acoustic, and imaging-based methods for localizing and communicating with such devices with spatial selectivity are limited by the physical properties of tissue or imaging modality performance. In the first part of this dissertation, we introduce a new approach for microscale device localization by embodying the principles of nuclear magnetic resonance in a silicon integrated circuit. By analogy to the behavior of nuclear spins, we engineer miniaturized RF transmitters that encode their location in space by shifting their output frequency in proportion to the local magnetic field. The application of external field gradients then allows each device’s location to be determined precisely from the frequency of its signal. We demonstrate the core capabilities of these devices, which we call addressable transmitters operated as magnetic spins (ATOMS), in an integrated circuit smaller than 0.7 mm^3, manufactured through a standard 180 nm complementary metal-oxide-semiconductor (CMOS) process. We show that ATOMS are capable of sub-millimeter localization in vitro and in vivo. As a technology that is inherently robust to tissue properties and scalable to multiple devices, ATOMS localization provides an enabling capability for the development of microscale devices to monitor and treat disease.

In neuroprosthetics, retinal prostheses aim to restore vision in patients suffering from advanced stages of retinal degeneration (e.g., retinitis pigmentosa) by bypassing the damaged photoreceptors and directly stimulating the remaining healthy neurons. In the second part of this dissertation, we describe a fully intraocular self-calibrating epiretinal prosthesis that reduces area and power consumption, and increases the functionality and resolution of traditional implementations. We introduce a novel digital calibration technique that matches the biphasic stimulation currents of each channel independently while sharing the calibration circuitry among every 4 channels. The system-on-chip presents dual-band telemetry for power and data with on-chip rectifier and clock recovery. These techniques reduce the number of off-chip components, achieve a power conversion efficiency >80%, and support data rates up to 20 Mb/s. The system occupies an area of 4.5 x 3.1 mm^2 and is implemented in 65 nm CMOS. It features 512 independent channels with a pixel size of 0.0169 mm^2 and arbitrary waveform generation per channel. The chip is integrated with flexible MEMS origami coils and parylene substrate to provide a fully intraocular implant.

}, address = {1200 East California Boulevard, Pasadena, California 91125}, advisor = {Emami, Azita}, } @phdthesis{10.7907/Z9K935HH, author = {Saeedi, Saman}, title = {Holistic Design In High-Speed Optical Interconnects}, school = {California Institute of Technology}, year = {2016}, doi = {10.7907/Z9K935HH}, url = {https://resolver.caltech.edu/CaltechTHESIS:10212015-150203289}, abstract = {

Integrated circuit scaling has enabled a huge growth in processing capability, which necessitates a corresponding increase in inter-chip communication bandwidth. As bandwidth requirements for chip-to-chip interconnection scale, deficiencies of electrical channels become more apparent. Optical links present a viable alternative due to their low frequency-dependent loss and higher bandwidth density in the form of wavelength division multiplexing. As integrated photonics and bonding technologies are maturing, commercialization of hybrid-integrated optical links is becoming a reality. Increasing silicon integration leads to better performance in optical links but necessitates a corresponding co-design strategy in both electronics and photonics. In this light, holistic design of high-speed optical links with an in-depth understanding of photonics and state-of-the-art electronics brings their performance to unprecedented levels. This thesis presents developments in high-speed optical links by co-designing and co-integrating the primary elements of an optical link: receiver, transmitter, and clocking.

In the first part of this thesis a 3D-integrated CMOS/Silicon-photonic receiver will be presented. The electronic chip features a novel design that employs a low-bandwidth TIA front-end, double-sampling and equalization through dynamic offset modulation. Measured results show -14.9dBm of sensitivity and energy efficiency of 170fJ/b at 25Gb/s. The same receiver front-end is also used to implement a source-synchronous 4-channel WDM-based parallel optical receiver. Quadrature ILO-based clocking is employed for synchronization, together with a novel frequency-tracking method that exploits the dynamics of IL in a quadrature ring oscillator to increase the effective locking range. An adaptive body-biasing circuit is designed to maintain the per-bit-energy consumption constant across wide data-rates. The prototype measurements indicate a record-low power consumption of 153fJ/b at 32Gb/s. The receiver sensitivity is measured to be -8.8dBm at 32Gb/s.

Next, on the optical transmitter side, three new techniques will be presented. The first is a differential ring modulator that breaks the optical bandwidth/quality factor trade-off known to limit the speed of high-Q ring modulators. This structure maintains a constant energy in the ring to avoid pattern-dependent power droop. As a first proof of concept, a prototype has been fabricated and measured up to 10Gb/s. The second technique is thermal stabilization of micro-ring resonator modulators through direct measurement of temperature using a monolithic PTAT temperature sensor. The measured temperature is used in a feedback loop to adjust the thermal tuner of the ring. A prototype is fabricated and a closed-loop feedback system is demonstrated to operate at 20Gb/s in the presence of temperature fluctuations. The third technique is a switched-capacitor based pre-emphasis technique designed to extend the inherently low bandwidth of carrier injection micro-ring modulators. A measured prototype of the optical transmitter achieves energy efficiency of 342fJ/bit at 10Gb/s and the wavelength stabilization circuit based on the monolithic PTAT sensor consumes 0.29mW.

Lastly, a first-order frequency synthesizer that is suitable for high-speed on-chip clock generation will be discussed. The proposed design features an architecture combining an LC quadrature VCO, two sample-and-holds, a PI, digital coarse-tuning, and rotational frequency detection for fine-tuning. In addition to an electrical reference clock, as an extra feature, the prototype chip is capable of receiving a low jitter optical reference clock generated by a high-repetition-rate mode-locked laser. The output clock at 8GHz has an integrated RMS jitter of 490fs, peak-to-peak periodic jitter of 2.06ps, and total RMS jitter of 680fs. The reference spurs are measured to be –64.3dB below the carrier frequency. At 8GHz the system consumes 2.49mW from a 1V supply.

}, address = {1200 East California Boulevard, Pasadena, California 91125}, advisor = {Emami, Azita}, } @phdthesis{10.7907/Z90P0WZD, author = {Raj, Mayank}, title = {Injection Locked Clocking and Transmitter Equalization Techniques for Chip to Chip Interconnects}, school = {California Institute of Technology}, year = {2015}, doi = {10.7907/Z90P0WZD}, url = {https://resolver.caltech.edu/CaltechTHESIS:11062014-090236636}, abstract = {

Semiconductor technology scaling has enabled drastic growth in the computational capacity of integrated circuits (ICs). This constant growth drives an increasing demand for high bandwidth communication between ICs. Electrical channel bandwidth has not been able to keep up with this demand, making I/O link design more challenging. Interconnects which employ optical channels have negligible frequency dependent loss and provide a potential solution to this I/O bandwidth problem. Apart from the type of channel, efficient high-speed communication also relies on generation and distribution of multi-phase, high-speed, and high-quality clock signals. In the multi-gigahertz frequency range, conventional clocking techniques have encountered several design challenges in terms of power consumption, skew and jitter. Injection-locking is a promising technique to address these design challenges for gigahertz clocking. However, its small locking range has been a major contributor in preventing its ubiquitous acceptance.

In the first part of this dissertation we describe a wideband injection locking scheme in an LC oscillator. Phase locked loop (PLL) and injection locking elements are combined symbiotically to achieve wide locking range while retaining the simplicity of the latter. This method does not require a phase frequency detector or a loop filter to achieve phase lock. A mathematical analysis of the system is presented and the expression for new locking range is derived. A locking range of 13.4 GHz–17.2 GHz (25%) and an average jitter tracking bandwidth of up to 400 MHz are measured in a high-Q LC oscillator. This architecture is used to generate quadrature phases from a single clock without any frequency division. It also provides high frequency jitter filtering while retaining the low frequency correlated jitter essential for forwarded clock receivers.

To improve the locking range of an injection locked ring oscillator, a quadrature locked loop (QLL) is introduced. The inherent dynamics of an injection locked quadrature ring oscillator are used to improve its locking range from 5% (7-7.4GHz) to 90% (4-11GHz). The QLL is used to generate accurate clock phases for a four channel optical receiver using a forwarded clock at quarter-rate. The QLL drives an injection locked oscillator (ILO) at each channel without any repeaters for local quadrature clock generation. Each local ILO has deskew capability for phase alignment. The optical receiver uses the inherent frequency to voltage conversion provided by the QLL to dynamically body bias its devices. A wide locking range of the QLL helps to achieve a reliable data-rate of 16-32Gb/s and adaptive body biasing aids in maintaining an ultra-low power consumption of 153fJ/bit.

From the optical receiver we move on to discussing a non-linear equalization technique for a vertical-cavity surface-emitting laser (VCSEL) based optical transmitter, to enable low-power, high-speed optical transmission. A non-linear time domain optical model of the VCSEL is built and evaluated for accuracy. The modelling shows that, while conventional FIR-based pre-emphasis works well for LTI electrical channels, it is not optimum for the non-linear optical frequency response of the VCSEL. Based on the simulations of the model an optimum equalization methodology is derived. The equalization technique is used to achieve a data-rate of 20Gb/s with power efficiency of 0.77pJ/bit.

}, address = {1200 East California Boulevard, Pasadena, California 91125}, advisor = {Emami, Azita}, } @phdthesis{10.7907/FQ18-2X96, author = {Loh Rui Yan, Matthew}, title = {Dense, Efficient Chip-to-Chip Communication at the Extremes of Computing}, school = {California Institute of Technology}, year = {2013}, doi = {10.7907/FQ18-2X96}, url = {https://resolver.caltech.edu/CaltechTHESIS:05082013-113725728}, abstract = {

The scalability of CMOS technology has driven computation into a diverse range of applications across the power consumption, performance and size spectra. Communication is a necessary adjunct to computation, and whether this is to push data from node-to-node in a high-performance computing cluster or from the receiver of a wireless link to a neural stimulator in a biomedical implant, interconnect can take up a significant portion of the overall system power budget. Although a single interconnect methodology cannot address such a broad range of systems efficiently, there are a number of key design concepts that enable good interconnect design in the age of highly-scaled CMOS: an emphasis on highly-digital approaches to solving ‘analog’ problems, hardware sharing between links as well as between different functions (such as equalization and synchronization) in the same link, and adaptive hardware that changes its operating parameters to mitigate not only variation in the fabrication of the link, but also link conditions that change over time. These concepts are demonstrated through the use of two design examples, at the extremes of the power and performance spectra.

A novel all-digital clock and data recovery technique for high-performance, high density interconnect has been developed. Two independently adjustable clock phases are generated from a delay line calibrated to 2 UI. One clock phase is placed in the middle of the eye to recover the data, while the other is swept across the delay line. The samples produced by the two clocks are compared to generate eye information, which is used to determine the best phase for data recovery. The functions of the two clocks are swapped after the data phase is updated; this ping-pong action allows an infinite delay range without the use of a PLL or DLL. The scheme’s generalized sampling and retiming architecture is used in a sharing technique that saves power and area in high-density interconnect. The eye information generated is also useful for tuning an adaptive equalizer, circumventing the need for dedicated adaptation hardware.

On the other side of the performance/power spectra, a capacitive proximity interconnect has been developed to support 3D integration of biomedical implants. In order to integrate more functionality while staying within size limits, implant electronics can be embedded onto a foldable parylene (‘origami’) substrate. Many of the ICs in an origami implant will be placed face-to-face with each other, so wireless proximity interconnect can be used to increase communication density while decreasing implant size, as well as facilitate a modular approach to implant design, where pre-fabricated parylene-and-IC modules are assembled together on-demand to make custom implants. Such an interconnect needs to be able to sense and adapt to changes in alignment. The proposed array uses a TDC-like structure to realize both communication and alignment sensing within the same set of plates, increasing communication density and eliminating the need to infer link quality from a separate alignment block. In order to distinguish the communication plates from the nearby ground plane, a stimulus is applied to the transmitter plate, which is rectified at the receiver to bias a delay generation block. This delay is in turn converted into a digital word using a TDC, providing alignment information.

}, address = {1200 East California Boulevard, Pasadena, California 91125}, advisor = {Emami, Azita}, } @phdthesis{10.7907/TSX2-EE48, author = {Honarvar Nazari, Meisam}, title = {Electrical and Optical Interconnects for High-Performance Computing}, school = {California Institute of Technology}, year = {2013}, doi = {10.7907/TSX2-EE48}, url = {https://resolver.caltech.edu/CaltechTHESIS:05282013-164038057}, abstract = {

Technology scaling has enabled drastic growth in the computational and storage capacity of integrated circuits (ICs). This constant growth drives an increasing demand for high-bandwidth communication between and within ICs. In this dissertation we focus on low-power solutions that address this demand. We divide communication links into three subcategories depending on the communication distance. Each category has a different set of challenges and requirements and is affected by CMOS technology scaling in a different manner. We start with short-range chip-to-chip links for board-level communication. Next we will discuss board-to-board links, which demand a longer communication range. Finally on-chip links with communication ranges of a few millimeters are discussed.

Electrical signaling is a natural choice for chip-to-chip communication due to efficient integration and low cost. IO data rates have increased to the point where electrical signaling is now limited by the channel bandwidth. In order to achieve multi-Gb/s data rates, complex designs that equalize the channel are necessary. In addition, a high level of parallelism is central to sustaining bandwidth growth. Decision feedback equalization (DFE) is one of the most commonly employed techniques to overcome the limited bandwidth problem of the electrical channels. A linear and low-power summer is the central block of a DFE. Conventional approaches employ current-mode techniques to implement the summer, which require high power consumption. In order to achieve low-power operation we propose performing the summation in the charge domain. This approach enables a low-power and compact realization of the DFE as well as crosstalk cancellation. A prototype receiver was fabricated in 45nm SOI CMOS to validate the functionality of the proposed technique and was tested over channels with different levels of loss and coupling. Measurement results show that the receiver can equalize channels with maximum 21dB loss while consuming about 7.5mW from a 1.2V supply. We also introduce a compact, low-power transmitter employing passive equalization. The efficacy of the proposed technique is demonstrated through implementation of a prototype in 65nm CMOS. The design achieves up to 20Gb/s data rate while consuming less than 10mW.

An alternative to electrical signaling is to employ optical signaling for chip-to-chip interconnections, which offers low channel loss and cross-talk while providing high communication bandwidth. In this work we demonstrate the possibility of building compact and low-power optical receivers. A novel RC front-end is proposed that combines dynamic offset modulation and double-sampling techniques to eliminate the need for a short time constant at the input of the receiver. Unlike conventional designs, this receiver does not require a high-gain stage that runs at the data rate, making it suitable for low-power implementations. In addition, it allows time-division multiplexing to support very high data rates. A prototype was implemented in 65nm CMOS and achieved up to 24Gb/s with less than 0.4pJ/b power efficiency per channel. As the proposed design mainly employs digital blocks, it benefits greatly from technology scaling in terms of power and area saving.

As the technology scales, the number of transistors on the chip grows. This necessitates a corresponding increase in the bandwidth of the on-chip wires. In this dissertation, we take a close look at wire scaling and investigate its effect on wire performance metrics. We explore a novel on-chip communication link based on a double-sampling architecture and dynamic offset modulation technique that enables low power consumption and high data rates while achieving high bandwidth density in 28nm CMOS technology. The functionality of the link is demonstrated using different length minimum-pitch on-chip wires. Measurement results show that the link achieves up to 20Gb/s of data rate (12.5Gb/s/μm) with better than 136fJ/b of power efficiency.

}, address = {1200 East California Boulevard, Pasadena, California 91125}, advisor = {Emami, Azita}, } @phdthesis{10.7907/Y3FA-VB87, author = {Yoo, Juhwan}, title = {Compressed Sensing Receivers: Theory, Design, and Performance Limits}, school = {California Institute of Technology}, year = {2012}, doi = {10.7907/Y3FA-VB87}, url = {https://resolver.caltech.edu/CaltechTHESIS:06122012-144158047}, abstract = {

The past 50 years have seen tremendous developments in electronics due to the rise and rapid development of IC-fabrication technology [1]. In addition to the production of cheap and abundant computing resources, another area of rapid advancement has been wireless technologies. While the central focus of wireless research has been mobile communication, an area of increasing importance concerns the development of sensing/spectral applications over bandwidths exceeding multiple GHz. Such systems have many applications ranging from scientific to military. Although some solutions exist, their large size, weight, and power make more-efficient solutions desirable.

At present, one of the principal bottlenecks in designing such systems is the power consumption of the back-end ADCs at the required digitization rate. ADCs are a dominant source of power consumption; it is also often the case that ADC block specifications are used to determine parameters for the rest of the signal chain, such as the RF front-end and the DSP-core which processes the digitized samples [2]. Historically, increases in system bandwidth have come from developing ADCs with superior performance.

In contrast to improving ADC performance, this work presents a system-level approach with the goal of minimizing the required digitization rate for observation of a given effective instantaneous bandwidth (EIBW). The approach was inspired by the field of compressed sensing (CS) [3–5]. Loosely stated, CS asserts that samples which represent random projections can be used to recover sparse and/or compressible signals with what was previously thought to be insufficient information. The primary contributions of this thesis include: the establishment of physical feasibility of CS-based receivers through implementation of the first fully-integrated high speed CS-based front-end known as the random-modulation pre-integrator (RMPI) [6–9], and the development of a principled design methodology based on a rigorous analytical and empirical feasibility study of the system.

The 8-channel RMPI was implemented in 90 nm CMOS and was validated by physical measurements of the fabricated chip. The implemented RMPI achieves an EIBW of 2 GHz, with > 54 dB of dynamic range. Most notably, the aggregate digitization rate is fs = 320 Msps, 12.5× lower than the Nyquist rate.

}, address = {1200 East California Boulevard, Pasadena, California 91125}, advisor = {Emami, Azita}, }