@string{OSDI96 = "Proc. of the Second Symposium on Operating Systems Design and Implementation"}

@InProceedings{Mowry:osdi96,
  author = "Todd C. Mowry, Angela K. Demke and Orran Krieger",
  title = "Automatic Compiler-Inserted I/O Prefetching for Out-of-Core Applications",
  booktitle = OSDI96,
  year = 1996,
  month = oct,
  address = "Seattle, WA",
  organization = "USENIX Assoc.",
  pages = "3--17",
  abstract = "
  Current operating systems offer poor performance when a numeri c application's working set does not fit
  in main memory. As a result, programmers who wish to solve ``out-of-core'' problems efficiently are
  typically faced with the onerous task of rewriting an application to use explicit I/O operations (e.g.,
  read/write). In this paper, we propose and evaluate a fully-automatic technique which liberates the
  programmer from this task, provides high performance, and requires only minimal changes to current
  operating systems. In our scheme, the compiler provides the crucial information on future access patterns
  without burdening the programmer, the operating system supports non-binding prefetch and release
  hints for managing I/O, and the operating sys tem cooperates with a run-time layer to accelerate
  performance by adapting to dynamic behavior and minimizing prefetch overhead. This approach maintains
  the abstraction of unlimited virtual memory for the programmer, gives the compiler the flexibility to
  aggressively move prefetches back ahead of references, and gives the operating system the flexibility to
  arbitrate between the competing resource demands of multiple applications. We have implemented our
  scheme using the SUIF compiler and the Hurricane operating system. Our experimental results
  demonstrate that our fully-automatic scheme effectively hides the I/O latency in out-of-core versions of
  the entire NAS Parallel benchmark suite, thus resulting in speedups of roughly twofold for five of the
  eight applications, with two applications speeding up by threefold or more. 
"
}

@InProceedings{Kimbrel:osdi96,
  author = "Tracy Kimbrel, Andrew Tomkins, R. Hugo Patterson, Brian Bershad, Pei Cao, Edward Felten, Garth
  		Gibson, Anna R. Karlin and Kai Li",
  title = "A Trace-Driven Comparison of Algorithms for Parallel
  		Prefetching and Caching",
  booktitle = OSDI96,
  year = 1996,
  month = oct,
  address = "Seattle, WA",
  organization = "USENIX Assoc.",
  pages = "19--34",
  abstract = "
  High-performance I/O systems depend on prefetching and caching in order to deliver good performance
  to applications. These two techniques have generally been considered in isolation, even though there are
  significant interactions between them; a block prefetched too early reduces the effectiveness of the cache,
  while a block cached too long reduces the effectiveness of prefetching. In this paper we study the effects
  of several combined prefetching and caching strategies for systems with multiple disks. Using
  disk-accurate trace-driven simulation, we explore the performance characteristics of each of the
  algorithms in cases in which applications provide full advance knowledge of accesses using hints. Some
  of the strategies have been published with theoretical performance bounds, and some are components of
  systems that have been built. One is a new algorithm that combines the desirable characteristics of the
  others. We find that when performance is limited by I/O stalls, aggressive prefetching helps to alleviate
  the problem; that more conservative prefetching is appropriate when significant I/O stalls are not
  present; and that a single, simple strategy is capable of doing both. 
"
}

@InProceedings{Sarkar:osdi96,
  author = "Prasenjit Sarkar and John Hartman",
  title = "Efficient Cooperative Caching using Hints",
  booktitle = OSDI96,
  year = 1996,
  month = oct,
  address = "Seattle, WA",
  organization = "USENIX Assoc.",
  pages = "35--46",
  abstract = "
  We present a very low-overhead decentralized algorithm for cooperative caching that provides
  performance comparable to that of existing centralized algorithms. Unlike existing algorithms that rely on
  centralized control of cache functions, our algorithm uses hints (i.e. inexact information) to allow clients
  to perform these functions in a decentralized fashion. This paper shows that a hint-based system
  performs as well as a more tightly coordinated system while requiring less overhead. Simulations show
  that the block access times of our system are as good as those of the existing tightly-coordinated
  algorithms, while reducing manager load by more than a factor of 15, block lookup traffic by nearly a factor
  of two-thirds, and replacement traffic by more than a factor of 5. 
"
}

@InProceedings{Perkovic:osdi96,
  author = "Dejan Perkovic and Peter J. Keleher",
  title = "Online Data-Race Detection via Coherency Guarantees",
  booktitle = OSDI96,
  year = 1996,
  month = oct,
  address = "Seattle, WA",
  organization = "USENIX Assoc.",
  pages = "47--57",
  abstract = "
  We present the design and evaluation of an on-the-fly data-race-detection technique that handles
  applications written for the lazy release consistent (LRC) shared memory model. 

  We require no explicit association between synchronization and shared memory. Hence, shared
  accesses have to be tracked and compared at the minimum granularity of data accesses, which is
  typically a single word. 

  The novel aspect of this system is that we are able to leverage information used to support the
  underlying memory abstraction to perform on-the-fly data-race detection, without compiler support. Our
  system consists of a minimally modified version of the CVM distributed shared memory system, and
  instrumentation code inserted by the ATOM code re-writer. 

  We present an experimental evaluation of our technique by using our system to look for data races in four
  unaltered programs. Our system correctly found read-write data races in a program that allows
  unsynchronized read access to a global tour bound, and a write-write race in a program from a standard
  benchmark suite. Overall, our mechanism reduced program performance by approximately a factor of two. 
"
}

@InProceedings{Costa:osdi96,
  author = "Manuel Costa, Paulo Guedes, Manuel Sequeira, Nuno Neves, Miguel Castro",
  title = "Lightweight Logging for Lazy Release Consistent Distributed
  		Shared Memory",
  booktitle = OSDI96,
  year = 1996,
  month = oct,
  address = "Seattle, WA",
  organization = "USENIX Assoc.",
  pages = "59--73",
  abstract = "
  This paper presents a new logging and recovery algorithm for lazy release consistent distributed shared
  memory (DSM). The new algorithm tolerates single node failures by maintaining a distributed log of data
  dependencies in the volatile memory of processes. 

  The algorithm adds very little overhead to the memory consistency protocol: it sends no additional
  messages during failure-free periods; it adds only a minimal amount of data to one of the DSM protocol
  messages; it introduces no forced rollbacks of non-faulty processes; and it performs no
  communication-induced accesses to stable storage. Furthermore, the algorithm logs only a very small
  amount of data, because it uses the log of memory accesses already maintained by the memory
  consistency protocol. 

  The algorithm was implemented in TreadMarks, a state-of-the-art DSM system. Experimental results
  show that the algorithm has near zero time overhead and very low space overhead during failure-free
  execution, thus refuting the common belief that logging overhead is necessarily high in recoverable DSM
  systems. 
"
}

@InProceedings{Zhou:osdi96,
  author = "Yuanyuan Zhou, Liviu Iftode and Kai Li",
  title = "Performance Evaluation of Two Home-Based Lazy Release
	  Consistency Protocols for Shared Virtual Memory Systems",
  booktitle = OSDI96,
  year = 1996,
  month = oct,
  address = "Seattle, WA",
  organization = "USENIX Assoc.",
  pages = "75--88",
  abstract = "
  This paper investigates the performance of shared virtual memory protocols on large-scale
  multicomputers. Using experiments on a 64-node Paragon, we show that the traditional Lazy Release
  Consistency (LRC) protocol does not scale well, because of the large number of messages it requires,
  the large amount of memory it consumes for protocol overhead data, and because of the difficulty of
  garbage collecting that data. 

  To achieve more scalable performance, we introduce and evaluate two new protocols. The first,
  Home-based LRC (HLRC), is based on the Automatic Update Release Consistency (AURC) protocol.
  Like AURC, HLRC maintains a home for each page to which all updates are propagated and from which
  all copies are derived. Unlike AURC, HLRC requires no specialized hardware support. We find that the
  use of homes provides substantial improvements in performance and scalability over LRC. 

  Our second protocol, called Overlapped Home-based LRC (OHLRC), takes advantage of the
  communication processor found on each node of the Paragon to offload some of the protocol overhead of
  HLRC from the critical path followed by the compute processor. We find that OHLRC provides modest
  improvements over HLRC. We also apply overlapping to the base LRC protocol, with similar results. 

  Our experiments were done using five of the Splash-2 benchmarks. We report overall execution times,
  as well as detailed breakdowns of elapsed time, message traffic, and memory use for each of the
  protocols. 
"
}

@InProceedings{Ford:osdi96,
  author = "Bryan Ford and Sai Susarla",
  title = "CPU Inheritance Scheduling",
  booktitle = OSDI96,
  year = 1996,
  month = oct,
  address = "Seattle, WA",
  organization = "USENIX Assoc.",
  pages = "91--105",
  abstract = "
  Traditional processor scheduling mechanisms in operating systems are fairly rigid, often supporting only
  one fixed scheduling policy, or, at most, a few ``scheduling classes'' whose implementations are closely
  tied together in the OS kernel. This paper presents CPU inheritance scheduling, a novel processor
  scheduling framework in which arbitrary threads can act as schedulers for other threads. Widely different
  scheduling policies can be implemented under the framework, and many different policies can coexist in a
  single system, providing much greater scheduling flexibility. Modular, hierarchical control can be provided
  over the processor utilization of arbitrary administrative domains, such as processes, jobs, users, and
  groups, and the CPU resources consumed can be accounted for and attributed accurately. Applications,
  as well as the OS, can implement customized local scheduling policies; the framework ensures that all
  the different policies work together logically and predictably. As a side effect, the framework also cleanly
  addresses priority inversion by providing a generalized form of priority inheritance that automatically
  works within and among diverse scheduling policies. CPU inheritance scheduling extends naturally to
  multiprocessors, and supports processor management techniques such as processor affinity[29] and
  scheduler activations[3]. We show that this flexibility can be provided with acceptable overhead in
  typical environments, depending on factors such as context switch speed and frequency. 
"
}

@InProceedings{Goyal:osdi96,
  author = "Pawan Goyal, Xingang Guo and Harrick M. Vin",
  title = "A Hierarchical CPU Scheduler for Multimedia Operating Systems",
  booktitle = OSDI96,
  year = 1996,
  month = oct,
  address = "Seattle, WA",
  organization = "USENIX Assoc.",
  pages = "107--121",
  abstract = "
  The need for supporting variety of hard and soft real-time, as well as best effort applications in a
  multimedia computing environment requires an operating system framework that: (1) enables different
  schedulers to be employed for different application classes, and (2) provides protection between the
  various classes of applications. We argue that these objectives can be achieved by hierarchical
  partitioning of CPU bandwidth, in which an operating system partitions the CPU bandwidth among
  various application classes, and each application class, in turn, partitions its allocation (potentially using
  a different scheduling algorithm) among its sub-classes or applications. We present Start-time Fair
  Queuing (SFQ) algorithm, which enables such hierarchical partitioning. We have implemented a
  hierarchical scheduler in Solaris 2.4. We describe our implementation, and demonstrate its suitability for
  multimedia operating systems. 
"
}

@InProceedings{Greenwald:osdi96,
  author = "Michael Greenwald and David Cheriton",
  title = "The Synergy Between Non-blocking Synchronization and
  	Operating System Structure",
  booktitle = OSDI96,
  year = 1996,
  month = oct,
  address = "Seattle, WA",
  organization = "USENIX Assoc.",
  pages = "123--136",
  abstract = "
  Non-blocking synchronization has significant advantages over blocking synchronization: however, it has
  not been used to a significant degree in practice. We designed and implemented a multiprocessor
  operating system kernel and run-time library for high-performance, reliability and modularity. We used
  non-blocking synchronization, not because it was an objective in itself, but because it became the
  approach of choice. It was an attractive approach because of the synergy between other structuring
  techniques we used to achieve our primary goals and the benefits of non-blocking synchronization. 

  This paper describes this synergy: the structuring techniques we used which facilitated non-blocking
  synchronization and our experience with this implementation. 
"
}

@InProceedings{Ford:osdi96A,
  author = "Bryan Ford, Mike Hibler, Jay Lepreau, Patrick Tullmann, Godmar Back and Stephen Clawson",
  title = "Microkernels Meet Recursive Virtual Machines",
  booktitle = OSDI96,
  year = 1996,
  month = oct,
  address = "Seattle, WA",
  organization = "USENIX Assoc.",
  pages = "137--151",
  abstract = "
  This paper describes a novel approach to providing modular and extensible operating system
  functionality and encapsulated environments based on a synthesis of microkernel and virtual machine
  concepts. We have developed a software-based virtualizable architecture called Fluke that allows
  recursive virtual machines (virtual machines running on other virtual machines) to be implemented
  efficiently by a microkernel running on generic hardware. A complete virtual machine interface is provided
  at each level; efficiency derives from needing to implement only new functionality at each level. This
  infrastructure allows common OS functionality, such as process management, demand paging, fault
  tolerance, and debugging support, to be provided by cleanly modularized, independent, stackable virtual
  machine monitors, implemented as user processes. It can also provide uncommon or unique OS features,
  including the above features specialized for particular applications' needs, virtual machines transparently
  distributed cross-node, or security monitors that allow arbitrary untrusted binaries to be executed
  safely. Our prototype implementation of this model indicates that it is practical to modularize operating
  systems this way. Some types of virtual machine layers impose almost no overhead at all, while others
  impose some overhead (typically 0--35%), but only on certain classes of applications. 
"
}

@InProceedings{Mosberger:osdi96,
  author = "David Mosberger and Larry L. Peterson",
  title = "Making Paths Explicit in the Scout Operating System",
  booktitle = OSDI96,
  year = 1996,
  month = oct,
  address = "Seattle, WA",
  organization = "USENIX Assoc.",
  pages = "153--167",
  abstract = "
  This paper makes a case for paths as an explicit abstraction in operating system design. Paths provide a
  unifying infrastructure for several OS mechanisms that have been introduced in the last several years,
  including fbufs, integrated layer processing, packet classifers, code specialization, and migrating threads.
  This paper articulates the potential advantages of a path-based OS structure, describes the specific path
  architecture implemented in the Scout OS, and demonstrates the advantages in a particular application
  domain--receiving, decoding, and displaying MPEG-compressed video. 
"
}

@InProceedings{Perl:osdi96,
  author = "Sharon E. Perl and Richard L. Sites",
  title = "Studies of Windows NT Performance using Dynamic Execution
  	Traces",
  booktitle = OSDI96,
  year = 1996,
  month = oct,
  address = "Seattle, WA",
  organization = "USENIX Assoc.",
  pages = "169--183",
  abstract = "
  We studied two aspects of the performance of Windows NTtm : processor bandwidth requirements for
  memory accesses in a uniprocessor system running commercial and benchmark applications, and locking
  behavior of a commercial database on a small-scale multiprocessor. Our studies are based on full
  dynamic execution traces of the systems, which include all instructions executed by the operating system
  and applications over periods of a few seconds (enough time to allow for significant computation). The
  traces were obtained on Alpha PCs, using a new software tool called PatchWrx that takes advantage of
  the Alpha architecture's PAL-code layer to implement efficient, comprehensive system tracing. Because
  the Alpha version of Windows NT uses substantially the same code base as other versions, and
  therefore executes nearly the same sequence of calls, basic blocks, and data structure accesses, we
  believe our conclusions are relevant for non-Alpha systems as well. This paper describes our
  performance studies and interesting aspects of PatchWrx. 

  We conclude from our studies that processor bandwidth can be a first-order bottleneck to achieving good
  performance. This is particularly apparent when studying commercial benchmarks. Operating system
  code and data structures contribute disproportionately to the memory access load. We also found that
  operating system software lock contention was a factor preventing the database benchmark from scaling
  up on the small multiprocessor, and that the cache coherence protocol employed by the machine
  introduced more cache interference than necessary. 
"
}

@InProceedings{Endo:osdi96,
  author = "Yasuhiro Endo, Zheng Wang, J. Bradley Chen, and Margo Seltzer",
  title = "Using Latency to Evaluate Interactive System Performance",
  booktitle = OSDI96,
  year = 1996,
  month = oct,
  address = "Seattle, WA",
  organization = "USENIX Assoc.",
  pages = "185--199",
  abstract = "
  The conventional methodology for system performance measurement, which relies primarily on
  throughput-sensitive benchmarks and throughput metrics, has major limitations when analyzing the
  behavior and performance of interactive workloads. The increasingly interactive character of personal
  computing demands new ways of measuring and analyzing system performance. In this paper, we
  present a combination of measurement techniques and benchmark methodologies that address these
  problems. We introduce several simple methods for making direct and precise measurements of event
  handling latency in the context of a realistic interactive application. We analyze how results from such
  measurements can be used to understand the detailed behavior of latency-critical events. We
  demonstrate our techniques in an analysis of the performance of two releases of Windows NT and
  Windows 95. Our experience indicates that latency can be measured for a class of interactive workloads,
  providing a substantial improvement in the accuracy and detail of performance information over
  measurements based strictly on throughput. 
"
}

@InProceedings{Pardyak:osdi96,
  author = "Przemyslaw Pardyak and Brian Bershad",
  title = "Dynamic Binding for an Extensible System",
  booktitle = OSDI96,
  year = 1996,
  month = oct,
  address = "Seattle, WA",
  organization = "USENIX Assoc.",
  pages = "201--212",
  abstract = "
  An extensible system requires a means to dynamically bind extensions into executing code. SPIN
  extensible operating system uses an event-based invocation mechanism to provide this functionality in
  a flexible, transparent, safe, and efficient way. Events offer a uniform model of extensibility, whereby the
  system's configuration can change without changing any of its components. Events are defined with the
  granularity and syntax of procedures but provide extended procedure call semantics such as conditional
  execution, multicast, and asynchrony. By installing a handler on an event, an extension's code can
  execute in response to activities at the granularity of procedure call. Our system uses runtime code
  generation to ensure that event delivery has low overhead and scales well with the number of handlers.
  This paper describes the design, use and performance of events in the SPIN operating system.
"
}

@InProceedings{Seltzer:osdi96,
  author = "Margo I. Seltzer, Yasuhiro Endo, Christopher Small, Keith A. Smith",
  title = "Dealing With Disaster: Surviving Misbehaved Kernel Extensions",
  booktitle = OSDI96,
  year = 1996,
  month = oct,
  address = "Seattle, WA",
  organization = "USENIX Assoc.",
  pages = "213--227",
  abstract = "
  Today's extensible operating systems allow applications to modify kernel behavior by providing
  mechanisms for application code to run in the kernel address space. The advantage of this approach is
  that it provides improved application flexibility and performance; the disadvantage is that buggy or
  malicious code can jeopardize the integrity of the kernel. It has been demonstrated that it is feasible to
  use safe languages, software fault isolation, or virtual memory protection to safeguard the main kernel.
  However, such protection mechanisms do not address the full range of problems, such as resource
  hoarding, that can arise when application code is introduced into the kernel. 

  In this paper, we present an analysis of extension mechanisms in the VINO kernel. VINO uses software
  fault isolation as its safety mechanism and a lightweight transaction system to cope with
  resource-hoarding. We explain how these two mechanisms are sufficient to protect against a large class
  of errant or malicious extensions, and we quantify the overhead that this protection introduces. 

  We find that while the overhead of these techniques is high relative to the cost of the extensions
  themselves, it is low relative to the benefits that extensibility brings. 
"
}

@InProceedings{Necula:osdi96,
  author = "George C. Necula and Peter Lee",
  title = "Safe Kernel Extensions Without Run-Time Checking",
  booktitle = OSDI96,
  year = 1996,
  month = oct,
  address = "Seattle, WA",
  organization = "USENIX Assoc.",
  pages = "229--243",
  abstract = "
  This paper describes a mechanism by which an operating system kernel can determine with certainty
  that it is safe to execute a binary supplied by an untrusted source. The kernel first defines a safety policy
  and makes it public. Then, using this policy, an application can provide binaries in a special form called
  proof-carrying code, or simply PCC. Each binary contains, in addition to the native code, a formal proof
  that the code obeys the safety policy. The kernel can easily validate the proof without using cryptography
  and without consulting any external trusted entities. If the validation succeeds, the code is guaranteed to
  respect the safety policy without relying on run-time checks. 

  The main practical difficulty of is in generating the safety proofs. In order to gain some preliminary
  experience with this, we have written several network packet filters in hand-tuned DEC Alpha
  assembly language, and then generated binaries for them using a special prototype assembler. The
  binaries can be executed with no run-time overhead, beyond a one-time cost of 1 to 3 milliseconds for
  validating the enclosed proofs. The net result is that our packet filters are formally guaranteed to be safe
  and are faster than packet filters created using Berkeley Packet Filters, Software Fault Isolation, or safe
  languages such as Modula-3. 
"
}

@InProceedings{Buzzard:osdi96,
  author = "Greg Buzzard, David Jacobson, Milon Mackey, Scott Marovich, and John Wilkes",
  title = "An implementation of the Hamlyn sender-managed interface
  	architecture",
  booktitle = OSDI96,
  year = 1996,
  month = oct,
  address = "Seattle, WA",
  organization = "USENIX Assoc.",
  pages = "245--259",
  abstract = "
  As the latency and bandwidth of multicomputer interconnection fabrics improve, there is a growing need
  for an interface between them and host processors that does not hide these gains behind software
  overhead. The Hamlyn interface architecture does this. It uses sender-based memory management to
  eliminate receiver buffer overruns, provides applications with direct hardware access to minimize latency,
  supports adaptive routing networks to allow higher throughput, and offers full protection between
  applications so that it can be used in a general-purpose computing environment. To test these claims we
  built a prototype Hamlyn interface for a Myrinet network connected to a standard HP workstation and
  report here on its design and performance. Our interface delivers an application-to- application round
  trip time of 28us for short messages and a one way time of 17.4us + 32.6ns/byte (30.7MB/s) for longer
  ones, while requiring fewer cpu cycles than an aggressive implementation of Active Messages on the
  CM-5. 
"
}

@InProceedings{Druschel:osdi96,
  author = "Peter Druschel and Gaurav Banga",
  title = "Lazy Receiver Processing (LRP): A Network Subsystem
  	Architecture for Server Systems",
  booktitle = OSDI96,
  year = 1996,
  month = oct,
  address = "Seattle, WA",
  organization = "USENIX Assoc.",
  pages = "261--275",
  abstract = "
  The explosive growth of the Internet, the widespread use of WWW-related applications, and the
  increased reliance on client-server architectures places interesting new demands on network servers. In
  particular, the operating system running on such systems needs to manage the machine's resources in a
  manner that maximizes and maintains throughput under conditions of high load. We propose and
  evaluate a new network subsystem architecture that provides improved fairness, stability, and increased
  throughput under high network load. The architecture is hardware independent and does not degrade
  network latency or bandwidth under normal load conditions. 
"
}

@InProceedings{Brustoloni:osdi96,
  author = "Jose' Carlos Brustoloni and Peter Steenkiste",
  title = "Effects of Buffering Semantics on I/O Performance",
  booktitle = OSDI96,
  year = 1996,
  month = oct,
  address = "Seattle, WA",
  organization = "USENIX Assoc.",
  pages = "277--291",
  abstract = "
  We present a novel taxonomy that characterizes in a structured way the software and hardware
  tradeoffs for I/O data passing between applications and operating system. This work contributes new
  techniques, input-disabled pageout, transient output copy-on-write, and input alignment, that are used
  for copy avoidance in an optimized buffering semantics, emulated copy. Emulated copy offers the same
  API and integrity guarantees as those of copy semantics and, therefore, can transparently replace it. We
  implemented an I/O framework, Genie, that allows applications to select any semantics in the taxonomy.
  Using Genie for communication between PCs and AlphaStations over an ATM network at 155 Mbps, we
  found that all non-copy semantics performed similarly, and that only copy semantics had distinctly
  inferior performance. We analyzed end-to-end latencies in terms of the costs of primitive data passing
  operations and modeled how those costs scale with CPU, memory, and network speeds. The analysis
  suggests that current trends tend to intensify the observed performance clustering. The main conclusion
  is that existing I/O interfaces with copy semantics, such as that of Unix, can be transparently converted
  to emulated copy semantics and thus achieve performance comparable to the best obtainable with any
  semantics in the taxonomy. 
"
}