@string{OSDI96 = "Proc. of the Second Symposium on Operating Systems Design and Implementation"} @InProceedings{Mowry:osdi96, author = "Todd C. Mowry, Angela K. Demke and Orran Krieger", title = "Automatic Compiler-Inserted I/O Prefetching for Out-of-Core Applications", booktitle = OSDI96, year = 1996, month = oct, address = "Seattle, WA", organization = "USENIX Assoc.", pages = "3--17", abstract = " Current operating systems offer poor performance when a numeri c application's working set does not fit in main memory. As a result, programmers who wish to solve ``out-of-core'' problems efficiently are typically faced with the onerous task of rewriting an application to use explicit I/O operations (e.g., read/write). In this paper, we propose and evaluate a fully-automatic technique which liberates the programmer from this task, provides high performance, and requires only minimal changes to current operating systems. In our scheme, the compiler provides the crucial information on future access patterns without burdening the programmer, the operating system supports non-binding prefetch and release hints for managing I/O, and the operating sys tem cooperates with a run-time layer to accelerate performance by adapting to dynamic behavior and minimizing prefetch overhead. This approach maintains the abstraction of unlimited virtual memory for the programmer, gives the compiler the flexibility to aggressively move prefetches back ahead of references, and gives the operating system the flexibility to arbitrate between the competing resource demands of multiple applications. We have implemented our scheme using the SUIF compiler and the Hurricane operating system. Our experimental results demonstrate that our fully-automatic scheme effectively hides the I/O latency in out-of-core versions of the entire NAS Parallel benchmark suite, thus resulting in speedups of roughly twofold for five of the eight applications, with two applications speeding up by threefold or more. " } @InProceedings{Kimbrel:osdi96, author = "Tracy Kimbrel, Andrew Tomkins, R. Hugo Patterson, Brian Bershad, Pei Cao, Edward Felten, Garth Gibson, Anna R. Karlin and Kai Li", title = "A Trace-Driven Comparison of Algorithms for Parallel Prefetching and Caching", booktitle = OSDI96, year = 1996, month = oct, address = "Seattle, WA", organization = "USENIX Assoc.", pages = "19--34", abstract = " High-performance I/O systems depend on prefetching and caching in order to deliver good performance to applications. These two techniques have generally been considered in isolation, even though there are significant interactions between them; a block prefetched too early reduces the effectiveness of the cache, while a block cached too long reduces the effectiveness of prefetching. In this paper we study the effects of several combined prefetching and caching strategies for systems with multiple disks. Using disk-accurate trace-driven simulation, we explore the performance characteristics of each of the algorithms in cases in which applications provide full advance knowledge of accesses using hints. Some of the strategies have been published with theoretical performance bounds, and some are components of systems that have been built. One is a new algorithm that combines the desirable characteristics of the others. We find that when performance is limited by I/O stalls, aggressive prefetching helps to alleviate the problem; that more conservative prefetching is appropriate when significant I/O stalls are not present; and that a single, simple strategy is capable of doing both. " } @InProceedings{Sarkar:osdi96, author = "Prasenjit Sarkar and John Hartman", title = "Efficient Cooperative Caching using Hints", booktitle = OSDI96, year = 1996, month = oct, address = "Seattle, WA", organization = "USENIX Assoc.", pages = "35--46", abstract = " We present a very low-overhead decentralized algorithm for cooperative caching that provides performance comparable to that of existing centralized algorithms. Unlike existing algorithms that rely on centralized control of cache functions, our algorithm uses hints (i.e. inexact information) to allow clients to perform these functions in a decentralized fashion. This paper shows that a hint-based system performs as well as a more tightly coordinated system while requiring less overhead. Simulations show that the block access times of our system are as good as those of the existing tightly-coordinated algorithms, while reducing manager load by more than a factor of 15, block lookup traffic by nearly a factor of two-thirds, and replacement traffic by more than a factor of 5. " } @InProceedings{Perkovic:osdi96, author = "Dejan Perkovic and Peter J. Keleher", title = "Online Data-Race Detection via Coherency Guarantees", booktitle = OSDI96, year = 1996, month = oct, address = "Seattle, WA", organization = "USENIX Assoc.", pages = "47--57", abstract = " We present the design and evaluation of an on-the-fly data-race-detection technique that handles applications written for the lazy release consistent (LRC) shared memory model. We require no explicit association between synchronization and shared memory. Hence, shared accesses have to be tracked and compared at the minimum granularity of data accesses, which is typically a single word. The novel aspect of this system is that we are able to leverage information used to support the underlying memory abstraction to perform on-the-fly data-race detection, without compiler support. Our system consists of a minimally modified version of the CVM distributed shared memory system, and instrumentation code inserted by the ATOM code re-writer. We present an experimental evaluation of our technique by using our system to look for data races in four unaltered programs. Our system correctly found read-write data races in a program that allows unsynchronized read access to a global tour bound, and a write-write race in a program from a standard benchmark suite. Overall, our mechanism reduced program performance by approximately a factor of two. " } @InProceedings{Costa:osdi96, author = "Manuel Costa, Paulo Guedes, Manuel Sequeira, Nuno Neves, Miguel Castro", title = "Lightweight Logging for Lazy Release Consistent Distributed Shared Memory", booktitle = OSDI96, year = 1996, month = oct, address = "Seattle, WA", organization = "USENIX Assoc.", pages = "59--73", abstract = " This paper presents a new logging and recovery algorithm for lazy release consistent distributed shared memory (DSM). The new algorithm tolerates single node failures by maintaining a distributed log of data dependencies in the volatile memory of processes. The algorithm adds very little overhead to the memory consistency protocol: it sends no additional messages during failure-free periods; it adds only a minimal amount of data to one of the DSM protocol messages; it introduces no forced rollbacks of non-faulty processes; and it performs no communication-induced accesses to stable storage. Furthermore, the algorithm logs only a very small amount of data, because it uses the log of memory accesses already maintained by the memory consistency protocol. The algorithm was implemented in TreadMarks, a state-of-the-art DSM system. Experimental results show that the algorithm has near zero time overhead and very low space overhead during failure-free execution, thus refuting the common belief that logging overhead is necessarily high in recoverable DSM systems. " } @InProceedings{Zhou:osdi96, author = "Yuanyuan Zhou, Liviu Iftode and Kai Li", title = "Performance Evaluation of Two Home-Based Lazy Release Consistency Protocols for Shared Virtual Memory Systems", booktitle = OSDI96, year = 1996, month = oct, address = "Seattle, WA", organization = "USENIX Assoc.", pages = "75--88", abstract = " This paper investigates the performance of shared virtual memory protocols on large-scale multicomputers. Using experiments on a 64-node Paragon, we show that the traditional Lazy Release Consistency (LRC) protocol does not scale well, because of the large number of messages it requires, the large amount of memory it consumes for protocol overhead data, and because of the difficulty of garbage collecting that data. To achieve more scalable performance, we introduce and evaluate two new protocols. The first, Home-based LRC (HLRC), is based on the Automatic Update Release Consistency (AURC) protocol. Like AURC, HLRC maintains a home for each page to which all updates are propagated and from which all copies are derived. Unlike AURC, HLRC requires no specialized hardware support. We find that the use of homes provides substantial improvements in performance and scalability over LRC. Our second protocol, called Overlapped Home-based LRC (OHLRC), takes advantage of the communication processor found on each node of the Paragon to offload some of the protocol overhead of HLRC from the critical path followed by the compute processor. We find that OHLRC provides modest improvements over HLRC. We also apply overlapping to the base LRC protocol, with similar results. Our experiments were done using five of the Splash-2 benchmarks. We report overall execution times, as well as detailed breakdowns of elapsed time, message traffic, and memory use for each of the protocols. " } @InProceedings{Ford:osdi96, author = "Bryan Ford and Sai Susarla", title = "CPU Inheritance Scheduling", booktitle = OSDI96, year = 1996, month = oct, address = "Seattle, WA", organization = "USENIX Assoc.", pages = "91--105", abstract = " Traditional processor scheduling mechanisms in operating systems are fairly rigid, often supporting only one fixed scheduling policy, or, at most, a few ``scheduling classes'' whose implementations are closely tied together in the OS kernel. This paper presents CPU inheritance scheduling, a novel processor scheduling framework in which arbitrary threads can act as schedulers for other threads. Widely different scheduling policies can be implemented under the framework, and many different policies can coexist in a single system, providing much greater scheduling flexibility. Modular, hierarchical control can be provided over the processor utilization of arbitrary administrative domains, such as processes, jobs, users, and groups, and the CPU resources consumed can be accounted for and attributed accurately. Applications, as well as the OS, can implement customized local scheduling policies; the framework ensures that all the different policies work together logically and predictably. As a side effect, the framework also cleanly addresses priority inversion by providing a generalized form of priority inheritance that automatically works within and among diverse scheduling policies. CPU inheritance scheduling extends naturally to multiprocessors, and supports processor management techniques such as processor affinity[29] and scheduler activations[3]. We show that this flexibility can be provided with acceptable overhead in typical environments, depending on factors such as context switch speed and frequency. " } @InProceedings{Goyal:osdi96, author = "Pawan Goyal, Xingang Guo and Harrick M. Vin", title = "A Hierarchical CPU Scheduler for Multimedia Operating Systems", booktitle = OSDI96, year = 1996, month = oct, address = "Seattle, WA", organization = "USENIX Assoc.", pages = "107--121", abstract = " The need for supporting variety of hard and soft real-time, as well as best effort applications in a multimedia computing environment requires an operating system framework that: (1) enables different schedulers to be employed for different application classes, and (2) provides protection between the various classes of applications. We argue that these objectives can be achieved by hierarchical partitioning of CPU bandwidth, in which an operating system partitions the CPU bandwidth among various application classes, and each application class, in turn, partitions its allocation (potentially using a different scheduling algorithm) among its sub-classes or applications. We present Start-time Fair Queuing (SFQ) algorithm, which enables such hierarchical partitioning. We have implemented a hierarchical scheduler in Solaris 2.4. We describe our implementation, and demonstrate its suitability for multimedia operating systems. " } @InProceedings{Greenwald:osdi96, author = "Michael Greenwald and David Cheriton", title = "The Synergy Between Non-blocking Synchronization and Operating System Structure", booktitle = OSDI96, year = 1996, month = oct, address = "Seattle, WA", organization = "USENIX Assoc.", pages = "123--136", abstract = " Non-blocking synchronization has significant advantages over blocking synchronization: however, it has not been used to a significant degree in practice. We designed and implemented a multiprocessor operating system kernel and run-time library for high-performance, reliability and modularity. We used non-blocking synchronization, not because it was an objective in itself, but because it became the approach of choice. It was an attractive approach because of the synergy between other structuring techniques we used to achieve our primary goals and the benefits of non-blocking synchronization. This paper describes this synergy: the structuring techniques we used which facilitated non-blocking synchronization and our experience with this implementation. " } @InProceedings{Ford:osdi96A, author = "Bryan Ford, Mike Hibler, Jay Lepreau, Patrick Tullmann, Godmar Back and Stephen Clawson", title = "Microkernels Meet Recursive Virtual Machines", booktitle = OSDI96, year = 1996, month = oct, address = "Seattle, WA", organization = "USENIX Assoc.", pages = "137--151", abstract = " This paper describes a novel approach to providing modular and extensible operating system functionality and encapsulated environments based on a synthesis of microkernel and virtual machine concepts. We have developed a software-based virtualizable architecture called Fluke that allows recursive virtual machines (virtual machines running on other virtual machines) to be implemented efficiently by a microkernel running on generic hardware. A complete virtual machine interface is provided at each level; efficiency derives from needing to implement only new functionality at each level. This infrastructure allows common OS functionality, such as process management, demand paging, fault tolerance, and debugging support, to be provided by cleanly modularized, independent, stackable virtual machine monitors, implemented as user processes. It can also provide uncommon or unique OS features, including the above features specialized for particular applications' needs, virtual machines transparently distributed cross-node, or security monitors that allow arbitrary untrusted binaries to be executed safely. Our prototype implementation of this model indicates that it is practical to modularize operating systems this way. Some types of virtual machine layers impose almost no overhead at all, while others impose some overhead (typically 0--35%), but only on certain classes of applications. " } @InProceedings{Mosberger:osdi96, author = "David Mosberger and Larry L. Peterson", title = "Making Paths Explicit in the Scout Operating System", booktitle = OSDI96, year = 1996, month = oct, address = "Seattle, WA", organization = "USENIX Assoc.", pages = "153--167", abstract = " This paper makes a case for paths as an explicit abstraction in operating system design. Paths provide a unifying infrastructure for several OS mechanisms that have been introduced in the last several years, including fbufs, integrated layer processing, packet classifers, code specialization, and migrating threads. This paper articulates the potential advantages of a path-based OS structure, describes the specific path architecture implemented in the Scout OS, and demonstrates the advantages in a particular application domain--receiving, decoding, and displaying MPEG-compressed video. " } @InProceedings{Perl:osdi96, author = "Sharon E. Perl and Richard L. Sites", title = "Studies of Windows NT Performance using Dynamic Execution Traces", booktitle = OSDI96, year = 1996, month = oct, address = "Seattle, WA", organization = "USENIX Assoc.", pages = "169--183", abstract = " We studied two aspects of the performance of Windows NTtm : processor bandwidth requirements for memory accesses in a uniprocessor system running commercial and benchmark applications, and locking behavior of a commercial database on a small-scale multiprocessor. Our studies are based on full dynamic execution traces of the systems, which include all instructions executed by the operating system and applications over periods of a few seconds (enough time to allow for significant computation). The traces were obtained on Alpha PCs, using a new software tool called PatchWrx that takes advantage of the Alpha architecture's PAL-code layer to implement efficient, comprehensive system tracing. Because the Alpha version of Windows NT uses substantially the same code base as other versions, and therefore executes nearly the same sequence of calls, basic blocks, and data structure accesses, we believe our conclusions are relevant for non-Alpha systems as well. This paper describes our performance studies and interesting aspects of PatchWrx. We conclude from our studies that processor bandwidth can be a first-order bottleneck to achieving good performance. This is particularly apparent when studying commercial benchmarks. Operating system code and data structures contribute disproportionately to the memory access load. We also found that operating system software lock contention was a factor preventing the database benchmark from scaling up on the small multiprocessor, and that the cache coherence protocol employed by the machine introduced more cache interference than necessary. " } @InProceedings{Endo:osdi96, author = "Yasuhiro Endo, Zheng Wang, J. Bradley Chen, and Margo Seltzer", title = "Using Latency to Evaluate Interactive System Performance", booktitle = OSDI96, year = 1996, month = oct, address = "Seattle, WA", organization = "USENIX Assoc.", pages = "185--199", abstract = " The conventional methodology for system performance measurement, which relies primarily on throughput-sensitive benchmarks and throughput metrics, has major limitations when analyzing the behavior and performance of interactive workloads. The increasingly interactive character of personal computing demands new ways of measuring and analyzing system performance. In this paper, we present a combination of measurement techniques and benchmark methodologies that address these problems. We introduce several simple methods for making direct and precise measurements of event handling latency in the context of a realistic interactive application. We analyze how results from such measurements can be used to understand the detailed behavior of latency-critical events. We demonstrate our techniques in an analysis of the performance of two releases of Windows NT and Windows 95. Our experience indicates that latency can be measured for a class of interactive workloads, providing a substantial improvement in the accuracy and detail of performance information over measurements based strictly on throughput. " } @InProceedings{Pardyak:osdi96, author = "Przemyslaw Pardyak and Brian Bershad", title = "Dynamic Binding for an Extensible System", booktitle = OSDI96, year = 1996, month = oct, address = "Seattle, WA", organization = "USENIX Assoc.", pages = "201--212", abstract = " An extensible system requires a means to dynamically bind extensions into executing code. SPIN extensible operating system uses an event-based invocation mechanism to provide this functionality in a flexible, transparent, safe, and efficient way. Events offer a uniform model of extensibility, whereby the system's configuration can change without changing any of its components. Events are defined with the granularity and syntax of procedures but provide extended procedure call semantics such as conditional execution, multicast, and asynchrony. By installing a handler on an event, an extension's code can execute in response to activities at the granularity of procedure call. Our system uses runtime code generation to ensure that event delivery has low overhead and scales well with the number of handlers. This paper describes the design, use and performance of events in the SPIN operating system. " } @InProceedings{Seltzer:osdi96, author = "Margo I. Seltzer, Yasuhiro Endo, Christopher Small, Keith A. Smith", title = "Dealing With Disaster: Surviving Misbehaved Kernel Extensions", booktitle = OSDI96, year = 1996, month = oct, address = "Seattle, WA", organization = "USENIX Assoc.", pages = "213--227", abstract = " Today's extensible operating systems allow applications to modify kernel behavior by providing mechanisms for application code to run in the kernel address space. The advantage of this approach is that it provides improved application flexibility and performance; the disadvantage is that buggy or malicious code can jeopardize the integrity of the kernel. It has been demonstrated that it is feasible to use safe languages, software fault isolation, or virtual memory protection to safeguard the main kernel. However, such protection mechanisms do not address the full range of problems, such as resource hoarding, that can arise when application code is introduced into the kernel. In this paper, we present an analysis of extension mechanisms in the VINO kernel. VINO uses software fault isolation as its safety mechanism and a lightweight transaction system to cope with resource-hoarding. We explain how these two mechanisms are sufficient to protect against a large class of errant or malicious extensions, and we quantify the overhead that this protection introduces. We find that while the overhead of these techniques is high relative to the cost of the extensions themselves, it is low relative to the benefits that extensibility brings. " } @InProceedings{Necula:osdi96, author = "George C. Necula and Peter Lee", title = "Safe Kernel Extensions Without Run-Time Checking", booktitle = OSDI96, year = 1996, month = oct, address = "Seattle, WA", organization = "USENIX Assoc.", pages = "229--243", abstract = " This paper describes a mechanism by which an operating system kernel can determine with certainty that it is safe to execute a binary supplied by an untrusted source. The kernel first defines a safety policy and makes it public. Then, using this policy, an application can provide binaries in a special form called proof-carrying code, or simply PCC. Each binary contains, in addition to the native code, a formal proof that the code obeys the safety policy. The kernel can easily validate the proof without using cryptography and without consulting any external trusted entities. If the validation succeeds, the code is guaranteed to respect the safety policy without relying on run-time checks. The main practical difficulty of is in generating the safety proofs. In order to gain some preliminary experience with this, we have written several network packet filters in hand-tuned DEC Alpha assembly language, and then generated binaries for them using a special prototype assembler. The binaries can be executed with no run-time overhead, beyond a one-time cost of 1 to 3 milliseconds for validating the enclosed proofs. The net result is that our packet filters are formally guaranteed to be safe and are faster than packet filters created using Berkeley Packet Filters, Software Fault Isolation, or safe languages such as Modula-3. " } @InProceedings{Buzzard:osdi96, author = "Greg Buzzard, David Jacobson, Milon Mackey, Scott Marovich, and John Wilkes", title = "An implementation of the Hamlyn sender-managed interface architecture", booktitle = OSDI96, year = 1996, month = oct, address = "Seattle, WA", organization = "USENIX Assoc.", pages = "245--259", abstract = " As the latency and bandwidth of multicomputer interconnection fabrics improve, there is a growing need for an interface between them and host processors that does not hide these gains behind software overhead. The Hamlyn interface architecture does this. It uses sender-based memory management to eliminate receiver buffer overruns, provides applications with direct hardware access to minimize latency, supports adaptive routing networks to allow higher throughput, and offers full protection between applications so that it can be used in a general-purpose computing environment. To test these claims we built a prototype Hamlyn interface for a Myrinet network connected to a standard HP workstation and report here on its design and performance. Our interface delivers an application-to- application round trip time of 28us for short messages and a one way time of 17.4us + 32.6ns/byte (30.7MB/s) for longer ones, while requiring fewer cpu cycles than an aggressive implementation of Active Messages on the CM-5. " } @InProceedings{Druschel:osdi96, author = "Peter Druschel and Gaurav Banga", title = "Lazy Receiver Processing (LRP): A Network Subsystem Architecture for Server Systems", booktitle = OSDI96, year = 1996, month = oct, address = "Seattle, WA", organization = "USENIX Assoc.", pages = "261--275", abstract = " The explosive growth of the Internet, the widespread use of WWW-related applications, and the increased reliance on client-server architectures places interesting new demands on network servers. In particular, the operating system running on such systems needs to manage the machine's resources in a manner that maximizes and maintains throughput under conditions of high load. We propose and evaluate a new network subsystem architecture that provides improved fairness, stability, and increased throughput under high network load. The architecture is hardware independent and does not degrade network latency or bandwidth under normal load conditions. " } @InProceedings{Brustoloni:osdi96, author = "Jose' Carlos Brustoloni and Peter Steenkiste", title = "Effects of Buffering Semantics on I/O Performance", booktitle = OSDI96, year = 1996, month = oct, address = "Seattle, WA", organization = "USENIX Assoc.", pages = "277--291", abstract = " We present a novel taxonomy that characterizes in a structured way the software and hardware tradeoffs for I/O data passing between applications and operating system. This work contributes new techniques, input-disabled pageout, transient output copy-on-write, and input alignment, that are used for copy avoidance in an optimized buffering semantics, emulated copy. Emulated copy offers the same API and integrity guarantees as those of copy semantics and, therefore, can transparently replace it. We implemented an I/O framework, Genie, that allows applications to select any semantics in the taxonomy. Using Genie for communication between PCs and AlphaStations over an ATM network at 155 Mbps, we found that all non-copy semantics performed similarly, and that only copy semantics had distinctly inferior performance. We analyzed end-to-end latencies in terms of the costs of primitive data passing operations and modeled how those costs scale with CPU, memory, and network speeds. The analysis suggests that current trends tend to intensify the observed performance clustering. The main conclusion is that existing I/O interfaces with copy semantics, such as that of Unix, can be transparently converted to emulated copy semantics and thus achieve performance comparable to the best obtainable with any semantics in the taxonomy. " }