<!DOCTYPE html>
<html lang="en-us">
<head>
<!-- Required meta tags -->
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no, user-scalable=no">
<!-- Font Awesome for social media icons -->
<script src="https://kit.fontawesome.com/791291c78f.js" crossorigin="anonymous"></script>
<!-- Bootstrap CSS -->
<link rel="stylesheet" href="https://stackpath.bootstrapcdn.com/bootstrap/4.4.1/css/bootstrap.min.css" integrity="sha384-Vkoo8x4CGsO3+Hhxv8T/Q5PaXtkKtu6ug5TOeNV6gBiFeWPGFN9MuhOf23Q9Ifjh" crossorigin="anonymous">
<link rel="stylesheet" href="https://cdn.jsdelivr.net/gh/jpswalsh/academicons@1/css/academicons.min.css">
<!-- Site Information -->
<title> SAIL@Princeton </title>
<style type="text/css">
.smlinks {
color: black;
}
.smlinks:hover {
color: rgb(7, 107, 255);
}
.paper-item {
margin-bottom: 15px; /* Adjust this value to increase/decrease the space */
}
.badge.badge-secondary {
cursor: pointer;
}
</style>
<!-- Favicon -->
<!-- TODO(ruipan): we could add a favicon of the website here -->
<!-- https://realfavicongenerator.net/ -->
<!-- <link rel="apple-touch-icon" sizes="180x180" href="/apple-touch-icon.png">
<link rel="icon" type="image/png" sizes="32x32" href="/favicon-32x32.png">
<link rel="icon" type="image/png" sizes="16x16" href="/favicon-16x16.png">
<link rel="manifest" href="/site.webmanifest">
<link rel="mask-icon" href="/safari-pinned-tab.svg" color="#5bbad5">
<meta name="msapplication-TileColor" content="#da532c">
<meta name="theme-color" content="#ffffff"> -->
<!-- Functionality for searching papers -->
<script src="https://code.jquery.com/jquery-3.6.0.min.js"></script>
<script>
$(document).ready(function () {
// Function to get URL parameters
function getQueryParam(name) {
let urlParams = new URLSearchParams(window.location.search);
return urlParams.get(name) || "";
}
// Function to perform search
function filterPapers(query) {
query = query.toLowerCase();
$(".paper-item").each(function () {
let text = $(this).text().toLowerCase();
$(this).closest("li").toggle(text.includes(query));
});
}
// Populate search bar and apply filter if "search" parameter exists
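// e.g., loading publications.html?search=inference pre-fills the search box and filters the list on page load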
let searchQuery = getQueryParam("search");
if (searchQuery) {
$("#search").val(searchQuery);
filterPapers(searchQuery); // Directly apply the filter
}
// Attach event listener for manual searches
$("#search").on("input", function () {
filterPapers($(this).val());
});
// Add click event to badge elements
$(".badge.badge-secondary").on("click", function () {
let keyword = $(this).text().trim();
$("#search").val(keyword).trigger("input"); // Update search bar and trigger filtering
});
// Clear search when "Clear" button is clicked
$("#clear-search").on("click", function () {
$("#search").val("").trigger("input"); // Clear input and reset filter
});
});
</script>
</head>
<body>
<!-- Nav Bar -->
<!-- TODO(ruipan): figure out how to align the nav items to the right rather than the left -->
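<!-- Possible approach for the TODO above (untested sketch, relying on Bootstrap 4's flexbox spacing utilities):
     replacing the list's mr-auto class with ml-auto pushes the nav links to the right edge of the navbar, e.g.
     <ul class="navbar-nav ml-auto"> ... </ul>
     ml-auto sets margin-left: auto, so the flex container places the list against the navbar's right side. -->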
<nav class="navbar navbar-expand-lg navbar-light sticky-top navbar-custom" style="background-color: #f58025">
<a class="navbar-brand" href="index.html">
<img src="./images/princeton_square.jpg" width="30" height="30" class="d-inline-block align-top">
SAIL@Princeton
</a>
<button class="navbar-toggler" type="button" data-toggle="collapse" data-target="#navbarSupportedContent" aria-controls="navbarSupportedContent" aria-expanded="false" aria-label="Toggle navigation">
<span class="navbar-toggler-icon"></span>
</button>
<div class="collapse navbar-collapse" id="navbarSupportedContent">
<ul class="navbar-nav mr-auto">
<li class="nav-item" data-toggle="collapse" data-target=".navbar-collapse.show">
<a class="nav-link" href="index.html#projects">Projects</a>
</li>
<li class="nav-item" data-toggle="collapse" data-target=".navbar-collapse.show">
<a class="nav-link" href="people.html">People</a>
</li>
<li class="nav-item" data-toggle="collapse" data-target=".navbar-collapse.show">
<a class="nav-link" href="publications.html">Publications</a>
</li>
</ul>
</div>
</nav>
<!-- Jumbotron -->
<div class="jumbotron jumbotron-fluid text-center">
<div class="container">
<div class="row align-items-center">
<div class="col-sm-12">
<h2 class="jumbotron-heading">Publications of SAIL@Princeton</h2>
<p class="lead">Our publications showcase cutting-edge research at the intersection of systems and machine learning,
advancing efficient, scalable, and secure AI/ML systems. From novel models and algorithms to optimized runtime systems for training and inference,
our work pushes the boundaries of next-generation AI infrastructure. Explore our latest contributions to AI/ML and systems research below.</p>
</div>
</div>
</div>
</div>
<!-- Search bar -->
<div class="container">
<div class="row">
<div class="col-sm-12">
<div class="d-flex mb-3">
<input type="text" id="search" class="form-control" placeholder="Search by title, author, or keyword..." style="flex: 1;">
<button id="clear-search" class="btn btn-outline-secondary ml-2">Reset</button>
</div>
</div>
</div>
</div>
<!-- Preprints -->
<div class="container">
<div class="row">
<div class="col-sm-12">
<h3>Preprints</h3>
<ul>
<li class="paper-item">
<h5>How to Train Long-Context Language Models (Effectively)</h5>
Tianyu Gao*, Alexander Wettig*, Howard Yen, Danqi Chen <br>
arXiv 2025<br>
<span class="badge badge-secondary">Efficient Training</span>
<div class="mt-2">
<a href="https://arxiv.org/abs/2410.02660" target="_blank">
<button type="button" class="btn btn-outline-primary btn-sm">PDF</button>
</a>
<a class="btn btn-outline-primary btn-sm" data-toggle="collapse" href="#prolong-abstract" role="button" aria-expanded="false" aria-controls="prolong-abstract">Abstract</a>
</div>
<div class="collapse" id="prolong-abstract">
<div class="card card-body">
We study continued training and supervised fine-tuning (SFT) of a language model (LM) to make effective use of long-context information.
We first establish a reliable evaluation protocol to guide model development -- instead of perplexity or simple needle-in-a-haystack (NIAH) tests,
we use a broad set of long-context tasks, and we evaluate models after SFT with instruction data as this better reveals long-context abilities.
Supported by our robust evaluations, we run thorough experiments to decide the data mix for continued pre-training, the instruction tuning dataset,
and many other design choices. We find that (1) code repositories and books are excellent sources of long data, but it is crucial to combine them with high-quality short data;
(2) training with a sequence length beyond the evaluation length boosts long-context performance;
(3) for SFT, using only short instruction datasets yields strong performance on long-context tasks.
Our final model, ProLong-8B, which is initialized from Llama-3 and trained on 40B tokens, demonstrates state-of-the-art long-context performance among similarly sized models at a length of 128K.
ProLong outperforms Llama-3.1-8B-Instruct on the majority of long-context tasks despite having seen only 5% as many tokens during long-context training.
Additionally, ProLong can effectively process up to 512K tokens, one of the longest context windows of publicly available LMs.
</div>
</div>
</li>
<li class="paper-item">
<h5>Certifiably Robust RAG against Retrieval Corruption</h5>
Chong Xiang*, Tong Wu*, Zexuan Zhong, David Wagner, Danqi Chen, Prateek Mittal <br>
arXiv 2025<br>
<span class="badge badge-secondary">Compound AI Systems</span>
<div class="mt-2">
<a href="https://arxiv.org/abs/2405.15556" target="_blank">
<button type="button" class="btn btn-outline-primary btn-sm">PDF</button>
</a>
<a class="btn btn-outline-primary btn-sm" data-toggle="collapse" href="#robustrag-abstract" role="button" aria-expanded="false" aria-controls="robustrag-abstract">Abstract</a>
</div>
<div class="collapse" id="robustrag-abstract">
<div class="card card-body">
Retrieval-augmented generation (RAG) has been shown vulnerable to retrieval corruption attacks: an attacker can inject malicious passages into retrieval results to induce inaccurate responses.
In this paper, we propose RobustRAG as the first defense framework against retrieval corruption attacks.
The key insight of RobustRAG is an isolate-then-aggregate strategy: we get LLM responses from each passage in isolation and then securely aggregate these isolated responses.
To instantiate RobustRAG, we design keyword-based and decoding-based algorithms for securely aggregating unstructured text responses.
Notably, RobustRAG can achieve certifiable robustness: we can formally prove and certify that, for certain queries, RobustRAG can always return accurate responses,
even when the attacker has full knowledge of our defense and can arbitrarily inject a small number of malicious passages. We evaluate RobustRAG on open-domain QA and long-form text generation datasets and demonstrate its effectiveness and generalizability across various tasks and datasets.
</div>
</div>
</li>
<li class="paper-item">
<h5>RAGServe: Fast Quality-Aware RAG Systems with Configuration Adaptation</h5>
Siddhant Ray, Rui Pan, Zhuohan Gu, Kuntai Du, Ganesh Ananthanarayanan, Ravi Netravali, Junchen Jiang <br>
arXiv 2024<br>
<span class="badge badge-secondary">Efficient Inference</span>
<span class="badge badge-secondary">Compound AI Systems</span>
<div class="mt-2">
<a href="https://arxiv.org/pdf/2412.10543" target="_blank">
<button type="button" class="btn btn-outline-primary btn-sm">PDF</button>
</a>
<a class="btn btn-outline-primary btn-sm" data-toggle="collapse" href="#ragserve-abstract" role="button" aria-expanded="false" aria-controls="ragserve-abstract">Abstract</a>
</div>
<div class="collapse" id="ragserve-abstract">
<div class="card card-body">
RAG (Retrieval Augmented Generation) allows LLMs (large language models) to
generate better responses with external knowledge, but using more external
knowledge often improves generation quality at the expense of response delay.
Prior work either reduces the response delay (through better scheduling of RAG
queries) or strives to maximize quality (which involves tuning the RAG workflow),
but they fall short in optimizing the tradeoff between the delay
and quality of RAG responses. This paper presents RAGServe, the first RAG system
that jointly schedules queries and adapts the key RAG configurations of each
job, such as the number of retrieved text chunks and synthesis methods,
in order to balance quality optimization and response delay reduction.
Using 4 popular RAG-QA datasets, we show that compared with the state-of-the-art
RAG scheduling system, RAGServe reduces the generation latency by 1.64--2.54×
without sacrificing generation quality.
</div>
</div>
</li>
</ul>
</div>
</div>
</div>
<!-- 2025 -->
<div class="container">
<div class="row">
<div class="col-sm-12">
<h3>2025</h3>
<ul>
<li class="paper-item">
<h5>LinGen: Towards High-Resolution Minute-Length Text-to-Video Generation with Linear Computational Complexity</h5>
Hongjie Wang, Chih-Yao Ma, Yen-Cheng Liu, Ji Hou, Tao Xu, Jialiang Wang, Felix Juefei-Xu, Yaqiao Luo, Peizhao Zhang, Tingbo Hou, Peter Vajda, Niraj K Jha, Xiaoliang Dai <br>
CVPR 2025<br>
<span class="badge badge-secondary">Efficient Inference</span>
<div class="mt-2">
<a href="https://arxiv.org/pdf/2412.09856" target="_blank">
<button type="button" class="btn btn-outline-primary btn-sm">PDF</button>
</a>
<a class="btn btn-outline-primary btn-sm" data-toggle="collapse" href="#lingen-abstract" role="button" aria-expanded="false" aria-controls="lingen-abstract">Abstract</a>
<a href="https://lineargen.github.io/" target="_blank">
<button type="button" class="btn btn-outline-primary btn-sm">Website</button>
</a>
</div>
<div class="collapse" id="lingen-abstract">
<div class="card card-body">
Text-to-video generation enhances content creation but
is highly computationally intensive: The computational cost
of Diffusion Transformers (DiTs) scales quadratically in the
number of pixels. This makes minute-length video generation extremely expensive, limiting most existing models to
generating videos of only 10-20 seconds length. We propose a Linear-complexity text-to-video Generation (LinGen) framework whose cost scales linearly in the number
of pixels. For the first time, LinGen enables high-resolution
minute-length video generation on a single GPU without
compromising quality. It replaces the computationally dominant and quadratic-complexity block, self-attention,
with a linear-complexity block called MATE, which consists of an MA-branch and a TE-branch. The MA-branch
targets short-to-long-range correlations, combining a bidirectional Mamba2 block with our token rearrangement
method, Rotary Major Scan, and our review tokens developed for long video generation. The TE-branch is a novel
TEmporal Swin Attention block that focuses on temporal
correlations between adjacent tokens and medium-range tokens. The MATE block addresses the adjacency preservation issue of Mamba and improves the consistency of generated videos significantly. Experimental results show that
LinGen outperforms DiT (with a 75.6% win rate) in video
quality with up to 15× (11.5×) FLOPs (latency) reduction.
Furthermore, both automatic metrics and human evaluation
demonstrate our LinGen-4B yields comparable video quality to state-of-the-art models (with a 50.5%, 52.1%, 49.1%
win rate with respect to Gen-3, LumaLabs, and Kling, respectively). This paves the way to hour-length movie generation and real-time interactive video generation. We provide 68s video generation results and more examples in our
project website: https://lineargen.github.io/.
</div>
</div>
</li>
<li class="paper-item">
<h5>Marconi: Prefix Caching for the Era of Hybrid LLMs</h5>
Rui Pan, Zhuang Wang, Zhen Jia, Can Karakus, Luca Zancato, Tri Dao, Yida Wang, Ravi Netravali <br>
MLSys 2025<br>
<span class="badge badge-secondary">Efficient Inference</span>
<span class="badge badge-secondary">Sequence Modeling</span>
<span class="badge badge-secondary">State Space Models</span>
<div class="mt-2">
<a href="https://arxiv.org/pdf/2411.19379" target="_blank">
<button type="button" class="btn btn-outline-primary btn-sm">PDF</button>
</a>
<a class="btn btn-outline-primary btn-sm" data-toggle="collapse" href="#marconi-abstract" role="button" aria-expanded="false" aria-controls="marconi-abstract">Abstract</a>
<a href="https://github.com/ruipeterpan/marconi" target="_blank">
<button type="button" class="btn btn-outline-primary btn-sm">Code</button>
</a>
</div>
<div class="collapse" id="marconi-abstract">
<div class="card card-body">
Hybrid models that combine the language modeling capabilities of Attention layers with the efficiency of Recurrent
layers (e.g., State Space Models) have gained traction in practically supporting long contexts in Large Language
Model serving. Yet, the unique properties of these models complicate the usage of complementary efficiency
optimizations such as prefix caching that skip redundant computations across requests. Most notably, their use of
in-place state updates for recurrent layers precludes rolling back cache entries for partial sequence overlaps, and
instead mandates only exact-match cache hits; the effect is a deluge of (large) cache entries per sequence, most
of which yield minimal reuse opportunities. We present Marconi, the first system that supports efficient prefix
caching with Hybrid LLMs. Key to Marconi are its novel admission and eviction policies that more judiciously
assess potential cache entries based not only on recency, but also on (1) forecasts of their reuse likelihood across a
taxonomy of different hit scenarios, and (2) the compute savings that hits deliver relative to memory footprints.
Across diverse workloads and Hybrid models, Marconi achieves up to 34.4× higher token hit rates (71.1% or 617
ms lower TTFT) compared to state-of-the-art prefix caching systems.
</div>
</div>
</li>
<li class="paper-item">
<h5>Mowgli: Passively Learned Rate Control for Real-Time Video</h5>
Neil Agarwal, Rui Pan, Francis Y. Yan, Ravi Netravali <br>
NSDI 2025<br>
<span class="badge badge-secondary">ML for Systems</span>
<span class="badge badge-secondary">Edge AI Systems</span>
<div class="mt-2">
<a href="https://arxiv.org/pdf/2410.03339" target="_blank">
<button type="button" class="btn btn-outline-primary btn-sm">PDF</button>
</a>
<a class="btn btn-outline-primary btn-sm" data-toggle="collapse" href="#mowgli-abstract" role="button" aria-expanded="false" aria-controls="mowgli-abstract">Abstract</a>
</div>
<div class="collapse" id="mowgli-abstract">
<div class="card card-body">
Rate control algorithms are at the heart of video conferencing platforms,
determining target bitrates that match dynamic network characteristics for high quality.
Recent data-driven strategies have shown promise for this challenging task,
but the performance degradation they introduce during training has been a nonstarter
for many production services, precluding adoption.
This paper aims to bolster the practicality of data-driven rate control by presenting
an alternative avenue for experiential learning:
leveraging purely existing telemetry logs produced by the incumbent algorithm in production.
We observe that these logs often contain effective decisions, although often at the wrong times or in the wrong order.
To realize this approach despite the inherent uncertainty that log-based learning brings
(i.e., lack of feedback for new decisions), our system, Mowgli,
combines a variety of robust learning techniques (i.e., conservatively reasoning
about alternate behavior to minimize risk and using a richer model formulation to account for environmental noise).
Across diverse networks (emulated and real-world), Mowgli outperforms the widely deployed GCC algorithm,
increasing average video bitrates by 15-39% while reducing freeze rates by 60-100%.
</div>
</div>
</li>
</ul>
</div>
</div>
</div>
<!-- 2024 -->
<div class="container">
<div class="row">
<div class="col-sm-12">
<h3>2024</h3>
<ul>
<li class="paper-item">
<h5>Catastrophic jailbreak of open-source LLMs via exploiting generation</h5>
Yangsibo Huang, Samyak Gupta, Mengzhou Xia, Kai Li, Danqi Chen<br>
ICLR 2024<br>
<div class="mt-2">
<a href="https://arxiv.org/pdf/2310.06987" target="_blank">
<button type="button" class="btn btn-outline-primary btn-sm">PDF</button>
</a>
<a class="btn btn-outline-primary btn-sm" data-toggle="collapse" href="#genexploit-abstract" role="button" aria-expanded="false" aria-controls="genexploit-abstract">Abstract</a>
</div>
<div class="collapse" id="genexploit-abstract">
<div class="card card-body">
The rapid progress in open-source large language models (LLMs) is significantly
advancing AI development. Extensive efforts have been made before model release to align their behavior with human values, with the primary goal of ensuring their helpfulness and harmlessness. However, even carefully aligned models can be manipulated maliciously, leading to unintended behaviors, known as
“jailbreaks”. These jailbreaks are typically triggered by specific text inputs, often referred to as adversarial prompts. In this work, we propose the generation
exploitation attack, an extremely simple approach that disrupts model alignment
by only manipulating variations of decoding methods. By exploiting different
generation strategies, including varying decoding hyper-parameters and sampling
methods, we increase the misalignment rate from 0% to more than 95% across
11 language models including LLAMA2, VICUNA, FALCON, and MPT families,
outperforming state-of-the-art attacks with 30× lower computational cost. Finally, we propose an effective alignment method that explores diverse generation
strategies, which can reasonably reduce the misalignment rate under our attack.
Altogether, our study underscores a major failure in current safety evaluation and
alignment procedures for open-source LLMs, strongly advocating for more comprehensive red teaming and better alignment before releasing such models.
</div>
</div>
</li>
<li class="paper-item">
<h5>MadEye: Boosting Live Video Analytics Accuracy with Adaptive Camera Configurations</h5>
Mike Wong, Murali Ramanujam, Guha Balakrishnan, Ravi Netravali<br>
NSDI 2024<br>
<span class="badge badge-secondary">Edge AI Systems</span>
<div class="mt-2">
<a href="https://michaeldwong.github.io/papers/madeye-nsdi24.pdf" target="_blank">
<button type="button" class="btn btn-outline-primary btn-sm">PDF</button>
</a>
<a class="btn btn-outline-primary btn-sm" data-toggle="collapse" href="#madeye-abstract" role="button" aria-expanded="false" aria-controls="madeye-abstract">Abstract</a>
</div>
<div class="collapse" id="madeye-abstract">
<div class="card card-body">
Camera orientations (i.e., rotation and zoom) govern the
content that a camera captures in a given scene, which in
turn heavily influences the accuracy of live video analytics
pipelines. However, existing analytics approaches leave this
crucial adaptation knob untouched, instead opting to only
alter the way that captured images from fixed orientations
are encoded, streamed, and analyzed. We present MadEye,
a camera-server system that automatically and continually
adapts orientations to maximize accuracy for the workload
and resource constraints at hand. To realize this using commodity pan-tilt-zoom (PTZ) cameras, MadEye embeds (1) a
search algorithm that rapidly explores the massive space of
orientations to identify a fruitful subset at each time, and (2) a
novel knowledge distillation strategy to efficiently (with only
camera resources) select the ones that maximize workload accuracy. Experiments on diverse workloads show that MadEye
boosts accuracy by 2.9-25.7% for the same resource usage, or
achieves the same accuracy with 2-3.7× lower resource costs.
</div>
</div>
</li>
<li class="paper-item">
<h5>ADR-X: ANN-Assisted Wireless Link Rate Adaptation for Compute-Constrained Embedded Gaming Devices</h5>
Hao Yin, Murali Ramanujam, Joe Schaefer, Stan Adermann, Srihari Narlanka, Perry Lea, Ravi Netravali, Krishna Chintalapudi<br>
NSDI 2024<br>
<span class="badge badge-secondary">ML for Systems</span>
<div class="mt-2">
<a href="https://www.usenix.org/system/files/nsdi24-yin.pdf" target="_blank">
<button type="button" class="btn btn-outline-primary btn-sm">PDF</button>
</a>
<a class="btn btn-outline-primary btn-sm" data-toggle="collapse" href="#adrx-abstract" role="button" aria-expanded="false" aria-controls="adrx-abstract">Abstract</a>
</div>
<div class="collapse" id="adrx-abstract">
<div class="card card-body">
The wireless channel between a gaming console and its accessories,
e.g., controllers and headsets, experiences extremely rapid
variations due to abrupt head and hand movements amidst
an exciting game. In the absence of prior studies on wireless
packet losses for console gaming, through extensive evaluations and user studies, we find that state-of-the-art rate adaptation schemes, unable to keep up with these rapid changes,
experience packet loss rates of 2-10% while loss rates that
are 10× lower (0.1-0.5%) are required to ensure a high quality gaming experience. We present ADR-X, an ANN-based
contextual multi-armed bandit rate adaptation technique that
continuously predicts and tracks the channel and picks appropriate data rates. A key challenge for ADR-X is that it must
run on power and compute constrained embedded devices
under realtime constraints. ADR-X addresses this challenge
by meticulously crafting an ANN that leverages existing communication theory results to incorporate domain knowledge.
This allows ADR-X to achieve 10× lower packet losses than
existing schemes while also running 100× faster than state-of-the-art reinforcement learning schemes, making it suitable
for deployment on embedded gaming devices.
</div>
</div>
</li>
<li class="paper-item">
<h5>NetVigil: Robust and Low-Cost Anomaly Detection for East-West Data Center Security</h5>
Kevin Hsieh*, Mike Wong*, Santiago Segarra, Sathiya Kumaran Mani, Trevor Eberl, Anatoliy Panasyuk, Ravi Netravali, Ranveer Chandra, Srikanth Kandula<br>
NSDI 2024<br>
<span class="badge badge-secondary">ML for Systems</span>
<span class="badge badge-secondary">Privacy and Security</span>
<span class="badge badge-secondary">Novel ML Applications</span>
<div class="mt-2">
<a href="https://michaeldwong.github.io/papers/netvigil-nsdi24.pdf" target="_blank">
<button type="button" class="btn btn-outline-primary btn-sm">PDF</button>
</a>
<a class="btn btn-outline-primary btn-sm" data-toggle="collapse" href="#netvigil-abstract" role="button" aria-expanded="false" aria-controls="netvigil-abstract">Abstract</a>
</div>
<div class="collapse" id="netvigil-abstract">
<div class="card card-body">
The growing number of breaches in data centers
underscores an urgent need for more effective security. Traditional perimeter defense measures and static zero-trust approaches are unable to address the unique challenges that arise
from the scale, complexity, and evolving nature of today’s
data center networks. To tackle these issues, we introduce
NetVigil, a robust and cost-efficient anomaly detection system
specifically designed for east-west traffic within data center
networks. NetVigil adeptly extracts security-focused, graphbased features from network flow logs and employs domainspecific graph neural networks (GNNs) and contrastive learning techniques to strengthen its resilience against normal
traffic variations and adversarial evasion strategies. Our evaluation, over various attack scenarios and traces from real-world
production clusters, shows that NetVigil delivers significant
improvements in accuracy, cost, and detection latency compared to state-of-the-art anomaly detection systems, providing
a practical, supplementary security mechanism to protect the
east-west traffic within data center networks.
</div>
</div>
</li>
<li class="paper-item">
<h5>
Apparate: Rethinking Early Exits to Tame Latency-Throughput Tensions in ML Serving
<img src="images/acm_available_1.1.png" height="25"/><img src="images/acm_functional_1.1.png" height="25"/><img src="images/acm_reproduced_1.1.png" height="25"/>
</h5>
Yinwei Dai*, Rui Pan*, Anand Iyer, Kai Li, Ravi Netravali <br>
SOSP 2024<br>
<span class="badge badge-secondary">Efficient Inference</span>
<div class="mt-2">
<a href="https://dl.acm.org/doi/pdf/10.1145/3694715.3695963" target="_blank">
<button type="button" class="btn btn-outline-primary btn-sm">PDF</button>
</a>
<a class="btn btn-outline-primary btn-sm" data-toggle="collapse" href="#apparate-abstract" role="button" aria-expanded="false" aria-controls="apparate-abstract">Abstract</a>
<a href="https://github.com/dywsjtu/apparate" target="_blank">
<button type="button" class="btn btn-outline-primary btn-sm">Code</button>
</a>
</div>
<div class="collapse" id="apparate-abstract">
<div class="card card-body">
Machine learning (ML) inference platforms are tasked with balancing two competing goals:
ensuring high throughput given many requests, and delivering low-latency responses to support interactive applications.
Unfortunately, existing platform knobs (e.g., batch sizes) fail to ease this fundamental tension,
and instead only enable users to harshly trade off one property for the other.
This paper explores an alternate strategy to taming throughput-latency tradeoffs by changing the granularity
at which inference is performed.
We present Apparate, a system that automatically applies and manages early exits (EEs) in ML models,
whereby certain inputs can exit with results at intermediate layers.
To cope with the time-varying overhead and accuracy challenges that EEs bring,
Apparate repurposes exits to provide continual feedback that powers several novel runtime monitoring and adaptation strategies.
Apparate lowers median response latencies by 40.5-91.5% and 10.0-24.2% for diverse CV and NLP classification workloads,
and median time-per-token latencies by 70.4-77.9% for generative scenarios,
without affecting throughputs or violating tight accuracy constraints.
</div>
</div>
</li>
<li class="paper-item">
<h5>Improving DNN Inference Throughput Using Practical, Per-Input Compute Adaptation</h5>
Anand Iyer, Mingyu Guan, Yinwei Dai, Rui Pan, Swapnil Gandhi, Ravi Netravali <br>
SOSP 2024<br>
<span class="badge badge-secondary">Efficient Inference</span>
<div class="mt-2">
<a href="https://dl.acm.org/doi/pdf/10.1145/3694715.3695978" target="_blank">
<button type="button" class="btn btn-outline-primary btn-sm">PDF</button>
</a>
<a class="btn btn-outline-primary btn-sm" data-toggle="collapse" href="#e3-abstract" role="button" aria-expanded="false" aria-controls="e3-abstract">Abstract</a>
</div>
<div class="collapse" id="e3-abstract">
<div class="card card-body">
Machine learning inference platforms continue to face high request rates and strict latency constraints.
Existing solutions largely focus on compressing models to substantially lower compute costs (and time) with mild accuracy degradations.
This paper explores an alternate (but complementary) technique that trades off accuracy and resource costs on a per-input granularity:
early exit models, which selectively allow certain inputs to exit a model from an intermediate layer.
Though intuitive, early exits face fundamental deployment challenges, largely owing to the effects that exiting inputs have on batch size (and resource utilization)
throughout model execution. We present E3, the first system that makes early exit models practical for realistic inference deployments.
Our key insight is to split and replicate blocks of layers in models in a manner that maintains a constant batch size throughout execution,
all the while accounting for resource requirements and communication overheads. Evaluations with NLP and vision models show that E3 can deliver up to 1.74×
improvement in goodput (for a fixed cost) or 1.78× reduction in cost (for a fixed goodput).
Additionally, E3's goodput wins generalize to autoregressive LLMs (2.8-3.8×) and compressed models (1.67×).
</div>
</div>
</li>
<li class="paper-item">
<h5>Zero-TPrune: Zero-Shot Token Pruning through Leveraging of the Attention Graph in Pre-Trained Transformers</h5>
Hongjie Wang, Bhishma Dedhia, Niraj K Jha<br>
CVPR 2024<br>
<span class="badge badge-secondary">Efficient Inference</span>
<div class="mt-2">
<a href="https://openaccess.thecvf.com/content/CVPR2024/papers/Wang_Zero-TPrune_Zero-Shot_Token_Pruning_through_Leveraging_of_the_Attention_Graph_CVPR_2024_paper.pdf" target="_blank">
<button type="button" class="btn btn-outline-primary btn-sm">PDF</button>
</a>
<a class="btn btn-outline-primary btn-sm" data-toggle="collapse" href="#zerotprune-abstract" role="button" aria-expanded="false" aria-controls="zero-tprune-abstract">Abstract</a>
</div>
<div class="collapse" id="zero-tprune-abstract">
<div class="card card-body">
Deployment of Transformer models on edge devices is
becoming increasingly challenging due to the exponentially
growing inference cost that scales quadratically with the
number of tokens in the input sequence. Token pruning is an
emerging solution to address this challenge due to its ease
of deployment on various Transformer backbones. However, most token pruning methods require computationally
expensive fine-tuning, which is undesirable in many edge
deployment cases. In this work, we propose Zero-TPrune,
the first zero-shot method that considers both the importance and similarity of tokens in performing token pruning. It leverages the attention graph of pre-trained Transformer models to produce an importance distribution for
tokens via our proposed Weighted Page Rank (WPR) algorithm. This distribution further guides token partitioning
for efficient similarity-based pruning. Due to the elimination of the fine-tuning overhead, Zero-TPrune can prune
large models at negligible computational cost, switch between different pruning configurations at no computational
cost, and perform hyperparameter tuning efficiently. We
evaluate the performance of Zero-TPrune on vision tasks
by applying it to various vision Transformer backbones and
testing them on ImageNet. Without any fine-tuning, Zero-TPrune reduces the FLOPs cost of DeiT-S by 34.7% and
improves its throughput by 45.3% with only 0.4% accuracy loss. Compared with state-of-the-art pruning methods that require fine-tuning, Zero-TPrune not only eliminates the need for fine-tuning after pruning but also does so
with only 0.1% accuracy loss. Compared with state-of-the-art fine-tuning-free pruning methods, Zero-TPrune reduces
accuracy loss by up to 49% with similar FLOPs budgets.
Project webpage: https://jha-lab.github.io/zerotprune.
</div>
</div>
</li>
<li class="paper-item">
<h5>AT-EDM: Attention-Driven Training-Free Efficiency Enhancement of Diffusion Models</h5>
Hongjie Wang, Difan Liu, Yan Kang, Yijun Li, Zhe Lin, Niraj K. Jha, Yuchen Liu<br>
CVPR 2024<br>
<span class="badge badge-secondary">Efficient Inference</span>
<span class="badge badge-secondary">Emerging Paradigms</span>
<div class="mt-2">
<a href="https://arxiv.org/pdf/2405.05252" target="_blank">
<button type="button" class="btn btn-outline-primary btn-sm">PDF</button>
</a>
<a class="btn btn-outline-primary btn-sm" data-toggle="collapse" href="#atedm-abstract" role="button" aria-expanded="false" aria-controls="atedm-abstract">Abstract</a>
<a href="https://atedm.github.io/" target="_blank">
<button type="button" class="btn btn-outline-primary btn-sm">Website</button>
</a>
</div>
<div class="collapse" id="atedm-abstract">
<div class="card card-body">
Diffusion Models (DMs) have exhibited superior performance in generating high-quality and diverse images. However, this exceptional performance comes at the cost of expensive architectural design, particularly due to the attention module heavily used in leading models. Existing works
mainly adopt a retraining process to enhance DM efficiency.
This is computationally expensive and not very scalable. To
this end, we introduce the Attention-driven Training-free
Efficient Diffusion Model (AT-EDM) framework that leverages attention maps to perform run-time pruning of redundant tokens, without the need for any retraining. Specifically, for single-denoising-step pruning, we develop a novel
ranking algorithm, Generalized Weighted Page Rank (G-WPR), to identify redundant tokens, and a similarity-based
recovery method to restore tokens for the convolution operation. In addition, we propose a Denoising-Steps-Aware
Pruning (DSAP) approach to adjust the pruning budget
across different denoising timesteps for better generation
quality. Extensive evaluations show that AT-EDM performs favorably against prior art in terms of efficiency
(e.g., 38.8% FLOPs saving and up to 1.53× speed-up over
Stable Diffusion XL) while maintaining nearly the same
FID and CLIP scores as the full model. Project webpage:
https://atedm.github.io.
</div>
</div>
</li>
<li class="paper-item">
<h5>DynaMo: Accelerating Language Model Inference with Dynamic Multi-Token Sampling</h5>
Shikhar Tuli, Chi-Heng Lin, Yen-Chang Hsu, Niraj Jha, Yilin Shen, Hongxia Jin<br>
NAACL 2024<br>
<span class="badge badge-secondary">Efficient Inference</span>
<div class="mt-2">
<a href="https://aclanthology.org/2024.naacl-long.182.pdf" target="_blank">
<button type="button" class="btn btn-outline-primary btn-sm">PDF</button>
</a>
<a class="btn btn-outline-primary btn-sm" data-toggle="collapse" href="#dynamo-abstract" role="button" aria-expanded="false" aria-controls="dynamo-abstract">Abstract</a>
</div>
<div class="collapse" id="dynamo-abstract">
<div class="card card-body">
Traditional language models operate autoregressively, i.e., they predict one token at a time. Rapid explosion in model sizes has resulted in high inference times. In this work, we propose DynaMo, a suite of multi-token prediction language models that reduce net inference times. Our models <em>dynamically</em> predict multiple tokens based on their confidence in the predicted joint probability distribution. We propose a lightweight technique to train these models, leveraging the weights of traditional autoregressive counterparts. Moreover, we propose novel ways to enhance the estimated joint probability to improve text generation quality, namely co-occurrence weighted masking and adaptive thresholding. We also propose systematic qualitative and quantitative methods to rigorously test the quality of generated text for non-autoregressive generation. One of the models in our suite, DynaMo-7.3B-T3, achieves same-quality generated text as the baseline (Pythia-6.9B) while achieving 2.57× speed-up with only 5.87% and 2.67% parameter and training time overheads, respectively.
</div>
</div>
</li>
<li class="paper-item">
<h5>LLMCompass: Enabling Efficient Hardware Design for Large Language Model Inference</h5>
Hengrui Zhang, August Ning, Rohan Baskar Prabhakar, and David Wentzlaff<br>
ISCA 2024<br>
<span class="badge badge-secondary">Hardware Design for ML</span>
<div class="mt-2">
<a href="https://parallel.princeton.edu/papers/isca24_llmcompass.pdf" target="_blank">
<button type="button" class="btn btn-outline-primary btn-sm">PDF</button>
</a>
<a class="btn btn-outline-primary btn-sm" data-toggle="collapse" href="#llmcompass-abstract" role="button" aria-expanded="false" aria-controls="llmcompass-abstract">Abstract</a>
</div>
<div class="collapse" id="llmcompass-abstract">
<div class="card card-body">
The past year has witnessed the increasing popularity of Large Language Models (LLMs). Their unprecedented
scale and associated high hardware cost have impeded their
broader adoption, calling for efficient hardware designs. With the
large hardware needed to simply run LLM inference, evaluating
different hardware designs becomes a new bottleneck.
This work introduces LLMCompass, a hardware evaluation
framework for LLM inference workloads. LLMCompass is fast,
accurate, versatile, and able to describe and evaluate different
hardware designs. LLMCompass includes a mapper to automatically find performance-optimal mapping and scheduling. It also
incorporates an area-based cost model to help architects reason
about their design choices. Compared to real-world hardware,
LLMCompass’ estimated latency achieves an average 10.9% error rate across various operators with various input sizes and an
average 4.1% error rate for LLM inference. With LLMCompass,
simulating a 4-NVIDIA A100 GPU node running GPT-3 175B
inference can be done within 16 minutes on commodity hardware,
including 26,400 rounds of the mapper’s parameter search.
With the aid of LLMCompass, this work draws architectural
implications and explores new cost-effective hardware designs. By
reducing the compute capability or replacing High Bandwidth
Memory (HBM) with traditional DRAM, these new designs
can achieve as much as 3.41x improvement in performance/cost
compared to an NVIDIA A100, making them promising choices
for democratizing LLMs.
</div>
</div>
</li>
<li class="paper-item">
<h5>Kraken: Inherently Parallel Transformers For Efficient Multi-Device Inference</h5>
Rohan Baskar Prabhakar, Hengrui Zhang, and David Wentzlaff <br>
NeurIPS 2024<br>
<span class="badge badge-secondary">Hardware Design for ML</span>
<div class="mt-2">
<a href="https://parallel.princeton.edu/papers/Kraken.pdf" target="_blank">
<button type="button" class="btn btn-outline-primary btn-sm">PDF</button>
</a>
<a class="btn btn-outline-primary btn-sm" data-toggle="collapse" href="#kraken-abstract" role="button" aria-expanded="false" aria-controls="kraken-abstract">Abstract</a>
</div>
<div class="collapse" id="kraken-abstract">
<div class="card card-body">
Large Transformer networks are increasingly used in settings where low inference latency is necessary to enable new applications and improve the end-user
experience. However, autoregressive inference is resource intensive and requires
parallelism for efficiency. Parallelism introduces collective communication that
is both expensive and represents a phase when hardware resources are underutilized. Towards mitigating this, Kraken is an evolution of the standard Transformer
architecture that is designed to complement existing tensor parallelism schemes
for efficient inference on multi-device systems. By introducing a fixed degree of
intra-layer model parallelism, the architecture allows collective operations to be
overlapped with compute, decreasing latency and increasing hardware utilization.
When trained on OpenWebText, Kraken models reach a similar perplexity as standard Transformers while also preserving their language modeling capabilities as
evaluated on the SuperGLUE benchmark. Importantly, when tested on multi-GPU
systems using TensorRT-LLM engines, Kraken speeds up Time To First Token by
a mean of 35.6% across a range of model sizes, context lengths, and degrees of
tensor parallelism.
</div>
</div>
</li>
<li class="paper-item">
<h5>SimPO: Simple Preference Optimization with a Reference-Free Reward</h5>
Yu Meng*, Mengzhou Xia*, Danqi Chen <br>
NeurIPS 2024<br>
<span class="badge badge-secondary">Efficient Training</span>
<div class="mt-2">
<a href="https://arxiv.org/pdf/2405.14734" target="_blank">
<button type="button" class="btn btn-outline-primary btn-sm">PDF</button>
</a>
<a class="btn btn-outline-primary btn-sm" data-toggle="collapse" href="#simpo-abstract" role="button" aria-expanded="false" aria-controls="simpo-abstract">Abstract</a>
</div>
<div class="collapse" id="simpo-abstract">
<div class="card card-body">
Direct Preference Optimization (DPO) is a widely used offline preference optimization algorithm that reparameterizes reward functions in reinforcement learning from human feedback (RLHF) to enhance simplicity and training stability.
In this work, we propose SimPO, a simpler yet more effective approach. The effectiveness of SimPO is attributed to a key design: using the average log probability of a sequence as the implicit reward.
This reward formulation better aligns with model generation and eliminates the need for a reference model, making it more compute and memory efficient.
Additionally, we introduce a target reward margin to the Bradley-Terry objective to encourage a larger margin between the winning and losing responses, further enhancing the algorithm's performance.
We compare SimPO to DPO and its latest variants across various state-of-the-art training setups, including both base and instruction-tuned models like Mistral and Llama3.
We evaluate them on extensive instruction-following benchmarks, including AlpacaEval 2, MT-Bench, and the recent challenging Arena-Hard benchmark.
Our results demonstrate that SimPO consistently and significantly outperforms existing approaches without substantially increasing response length. Specifically, SimPO outperforms DPO by up to 6.4 points on AlpacaEval 2 and by up to 7.5 points on Arena-Hard.
Our top-performing model, built on Llama3-8B-Instruct, achieves a remarkable 44.7 length-controlled win rate on AlpacaEval 2 -- surpassing Claude 3 Opus on the leaderboard, and a 33.8 win rate on Arena-Hard -- making it the strongest 8B open-source model.
</div>
</div>
</li>
<li class="paper-item">
<h5>Lory: Fully Differentiable Mixture-of-Experts for Autoregressive Language Model Pre-training</h5>
Zexuan Zhong, Mengzhou Xia, Danqi Chen, Mike Lewis <br>
COLM 2024<br>
<span class="badge badge-secondary">Emerging Paradigms</span>
<div class="mt-2">
<a href="https://arxiv.org/abs/2405.03133" target="_blank">
<button type="button" class="btn btn-outline-primary btn-sm">PDF</button>
</a>
<a class="btn btn-outline-primary btn-sm" data-toggle="collapse" href="#lory-abstract" role="button" aria-expanded="false" aria-controls="lory-abstract">Abstract</a>
</div>
<div class="collapse" id="lory-abstract">
<div class="card card-body">
Mixture-of-experts (MoE) models facilitate efficient scaling; however, training the router network introduces the challenge of optimizing a non-differentiable, discrete objective.
Recently, a fully-differentiable MoE architecture, SMEAR, was proposed (Muqeeth et al., 2023), which softly merges experts in the parameter space; nevertheless, its effectiveness was only demonstrated in downstream fine-tuning on classification tasks.
In this paper, we present Lory, the first approach that scales such architectures to autoregressive language model pre-training.
Lory introduces two key techniques: (1) a causal segment routing strategy that achieves high efficiency for expert merging operations while preserving the autoregressive nature of language models;
(2) a similarity-based data batching method that encourages expert specialization by grouping similar documents in training instances.
We pre-train a series of Lory models on 150B tokens from scratch, with up to 32 experts and 30B (1.5B active) parameters.
Experimental results show significant performance gains over parameter-matched dense models on both perplexity (+13.9%) and a variety of downstream tasks (+1.5%-11.1%).
Despite segment-level routing, Lory models achieve competitive performance compared to state-of-the-art MoE models with token-level routing. We further demonstrate that the trained experts in Lory capture domain-level specialization without supervision.
Our work highlights the potential of fully-differentiable MoE architectures for language model pre-training and advocates future research in this area.
</div>
</div>
</li>
<li class="paper-item">
<h5>Transformers are SSMs: Generalized Models and Efficient Algorithms Through Structured State Space Duality</h5>
Tri Dao, Albert Gu <br>
ICML 2024<br>
<span class="badge badge-secondary">Emerging Paradigms</span>
<div class="mt-2">
<a href="https://arxiv.org/pdf/2405.21060" target="_blank">
<button type="button" class="btn btn-outline-primary btn-sm">PDF</button>
</a>
<a class="btn btn-outline-primary btn-sm" data-toggle="collapse" href="#transformers-are-ssms-abstract" role="button" aria-expanded="false" aria-controls="transformers-are-ssms-abstract">Abstract</a>
</div>
<div class="collapse" id="transformers-are-ssms-abstract">
<div class="card card-body">
While Transformers have been the main architecture behind deep learning's success in language modeling, state-space models (SSMs) such as Mamba have recently been shown to match or outperform Transformers at small to medium scale. We show that these families of models are actually quite closely related, and develop a rich framework of theoretical connections between SSMs and variants of attention, connected through various decompositions of a well-studied class of structured semiseparable matrices. Our state space duality (SSD) framework allows us to design a new architecture (Mamba-2) whose core layer is a refinement of Mamba's selective SSM that is 2-8X faster, while continuing to be competitive with Transformers on language modeling.
</div>
</div>
</li>
<li class="paper-item">
<h5>FlashAttention-2: Faster Attention with Better Parallelism and Work Partitioning</h5>
Tri Dao<br>
ICLR 2024<br>
<span class="badge badge-secondary">Emerging Paradigms</span>
<div class="mt-2">
<a href="https://arxiv.org/abs/2307.08691" target="_blank">
<button type="button" class="btn btn-outline-primary btn-sm">PDF</button>
</a>
<a class="btn btn-outline-primary btn-sm" data-toggle="collapse" href="#flashattention2-abstract" role="button" aria-expanded="false" aria-controls="flashattention2-abstract">Abstract</a>
</div>
<div class="collapse" id="flashattention2-abstract">
<div class="card card-body">
Scaling Transformers to longer sequence lengths has been a major problem in the last several years, promising to improve performance in language modeling and high-resolution image understanding, as well as to unlock new applications in code, audio, and video generation. The attention layer is the main bottleneck in scaling to longer sequences, as its runtime and memory increase quadratically in the sequence length. FlashAttention exploits the asymmetric GPU memory hierarchy to bring significant memory saving (linear instead of quadratic) and runtime speedup (2-4× compared to optimized baselines), with no approximation. However, FlashAttention is still not nearly as fast as optimized matrix-multiply (GEMM) operations, reaching only 25-40% of the theoretical maximum FLOPs/s. We observe that the inefficiency is due to suboptimal work partitioning between different thread blocks and warps on the GPU, causing either low-occupancy or unnecessary shared memory reads/writes. We propose FlashAttention-2, with better work partitioning to address these issues. In particular, we (1) tweak the algorithm to reduce the number of non-matmul FLOPs, (2) parallelize the attention computation, even for a single head, across different thread blocks to increase occupancy, and (3) within each thread block, distribute the work between warps to reduce communication through shared memory. These yield around 2× speedup compared to FlashAttention, reaching 50-73% of the theoretical maximum FLOPs/s on A100 and getting close to the efficiency of GEMM operations. We empirically validate that when used end-to-end to train GPT-style models, FlashAttention-2 reaches training speed of up to 225 TFLOPs/s per A100 GPU (72% model FLOPs utilization).
</div>
</div>
</li>
<li class="paper-item">
<h5>Sheared LLaMA: Accelerating Language Model Pre-training via Structured Pruning</h5>
Mengzhou Xia, Tianyu Gao, Zhiyuan Zeng, Danqi Chen<br>
ICLR 2024<br>
<span class="badge badge-secondary">Efficient Training</span>
<div class="mt-2">
<a href="https://arxiv.org/abs/2310.06694" target="_blank">
<button type="button" class="btn btn-outline-primary btn-sm">PDF</button>
</a>
<a class="btn btn-outline-primary btn-sm" data-toggle="collapse" href="#sheared-llama-abstract" role="button" aria-expanded="false" aria-controls="sheared-llama-abstract">Abstract</a>
</div>
<div class="collapse" id="sheared-llama-abstract">
<div class="card card-body">
The popularity of LLaMA (Touvron et al., 2023a;b) and other recently emerged moderate-sized large language models (LLMs) highlights the potential of building smaller yet powerful LLMs.
Regardless, the cost of training such models from scratch on trillions of tokens remains high. In this work, we study structured pruning as an effective means to develop smaller LLMs from pre-trained, larger models.
Our approach employs two key techniques: (1) targeted structured pruning, which prunes a larger model to a specified target shape by removing layers, heads, and intermediate and hidden dimensions in an end-to-end manner,
and (2) dynamic batch loading, which dynamically updates the composition of sampled data in each training batch based on varying losses across different domains. We demonstrate the efficacy of our approach by presenting the Sheared-LLaMA series, pruning the LLaMA2-7B model down to 1.3B and 2.7B parameters.
Sheared-LLaMA models outperform state-of-the-art open-source models of equivalent sizes, such as Pythia, INCITE, and OpenLLaMA models, on a wide range of downstream and instruction tuning evaluations, while requiring only 3% of compute compared to training such models from scratch.
This work provides compelling evidence that leveraging existing LLMs with structured pruning is a far more cost-effective approach for building smaller LLMs.
</div>
</div>
</li>
</ul>
</div>
</div>
</div>
<!-- 2023 -->
<div class="container">
<div class="row">
<div class="col-sm-12">
<h3>2023</h3>
<ul>
<li class="paper-item">
<h5>SCouT: Synthetic Counterfactuals via Spatiotemporal Transformers for Actionable Healthcare</h5>
Bhishma Dedhia, Roshini Balasubramanian, Niraj K. Jha<br>
ACM Transactions on Computing for Healthcare, October 2023<br>
<span class="badge badge-secondary">Novel ML Applications</span>
<div class="mt-2">
<a href="https://dl.acm.org/doi/10.1145/3617180" target="_blank">
<button type="button" class="btn btn-outline-primary btn-sm">PDF</button>
</a>
<a class="btn btn-outline-primary btn-sm" data-toggle="collapse" href="#scout-abstract" role="button" aria-expanded="false" aria-controls="scout-abstract">Abstract</a>
</div>
<div class="collapse" id="scout-abstract">
<div class="card card-body">
The synthetic control method has pioneered a class of powerful data-driven techniques to estimate the counterfactual reality of a unit from donor units. At its core, the technique involves a linear model fitted on the pre-intervention period that combines donor outcomes to yield the counterfactual. However, linearly combining spatial information at each time instance using time-agnostic weights fails to capture important inter-unit and intra-unit temporal contexts and complex nonlinear dynamics of real data. We instead propose an approach to use local spatiotemporal information before the onset of the intervention as a promising way to estimate the counterfactual sequence. To this end, we suggest a Transformer model that leverages particular positional embeddings, a modified decoder attention mask, and a novel pre-training task to perform spatiotemporal sequence-to-sequence modeling. Our experiments on synthetic data demonstrate the efficacy of our method in the typical small donor pool setting and its robustness against noise. We also generate actionable healthcare insights at the population and patient levels by simulating a state-wide public health policy to evaluate its effectiveness, an in silico trial for asthma medications to support randomized controlled trials, and a medical intervention for patients with Friedreich’s ataxia to improve clinical decision making and promote personalized therapy (code is available at https://github.com/JHA-Lab/scout).
</div>
</div>
</li>
<li class="paper-item">
<h5>EdgeTran: Device-Aware Co-Search of Transformers for Efficient Inference on Mobile Edge Platforms</h5>
Shikhar Tuli, Niraj K. Jha<br>
IEEE Transactions on Mobile Computing 2023<br>
<span class="badge badge-secondary">Edge AI Systems</span>
<div class="mt-2">
<a href="https://ieeexplore.ieee.org/abstract/document/10301516" target="_blank">
<button type="button" class="btn btn-outline-primary btn-sm">PDF</button>
</a>
<a class="btn btn-outline-primary btn-sm" data-toggle="collapse" href="#edgetran-abstract" role="button" aria-expanded="false" aria-controls="edgetran-abstract">Abstract</a>
</div>
<div class="collapse" id="edgetran-abstract">
<div class="card card-body">
Automated design of efficient transformer models has recently attracted significant attention from industry and academia. However, most works only focus on certain metrics while searching for the best-performing transformer architecture. Furthermore, running traditional, complex, and large transformer models on low-compute edge platforms is a challenging problem. In this work, we propose a framework, called ProTran, to profile the hardware performance measures for a design space of transformer architectures and a diverse set of edge devices. We use this profiler in conjunction with the proposed co-search technique to obtain the best-performing models that have high accuracy on the given task and minimize latency, energy consumption, and peak power draw to enable edge deployment. We refer to our framework for co-optimizing accuracy and hardware performance measures as EdgeTran. It searches for the best transformer model and edge device pair. Finally, we propose GPTran, a multi-stage block-level grow-and-prune post-processing step that further improves accuracy in a hardware-aware manner. The obtained transformer model is 2.8× smaller and has a 0.8% higher GLUE score than the baseline (BERT-Base). Inference with it on the selected edge device enables 15.0% lower latency, 10.0× lower energy, and 10.8× lower peak power draw compared to an off-the-shelf GPU.
</div>
</div>
</li>
<li class="paper-item">
<h5>Privacy Implications of Retrieval-Based Language Models</h5>
Yangsibo Huang, Samyak Gupta, Zexuan Zhong, Kai Li, Danqi Chen<br>
EMNLP 2023<br>
<span class="badge badge-secondary">Compound AI Systems</span>
<span class="badge badge-secondary">Privacy and Security</span>
<div class="mt-2">
<a href="https://arxiv.org/pdf/2305.14888" target="_blank">
<button type="button" class="btn btn-outline-primary btn-sm">PDF</button>
</a>
<a class="btn btn-outline-primary btn-sm" data-toggle="collapse" href="#retrievalprivacy-abstract" role="button" aria-expanded="false" aria-controls="retrievalprivacy-abstract">Abstract</a>
</div>
<div class="collapse" id="retrievalprivacy-abstract">
<div class="card card-body">
Retrieval-based language models (LMs) have demonstrated improved interpretability, factuality, and adaptability compared to their parametric counterparts, by incorporating retrieved text from external datastores. While it is well known that parametric models are prone to leaking private data, it remains unclear how the addition of a retrieval datastore impacts model privacy. In this work, we present the first study of privacy risks in retrieval-based LMs, particularly kNN-LMs. Our goal is to explore the optimal design and training procedure in domains where privacy is of concern, aiming to strike a balance between utility and privacy. Crucially, we find that kNN-LMs are more susceptible to leaking private information from their private datastore than parametric models. We further explore mitigations of privacy risks. When privacy information is targeted and readily detected in the text, we find that a simple sanitization step would completely eliminate the risks, while decoupling query and key encoders achieves an even better utility-privacy trade-off. Otherwise, we consider strategies of mixing public and private data in both datastore and encoder training. While these methods offer modest improvements, they leave considerable room for future work. Together, our findings provide insights for practitioners to better understand and mitigate privacy risks in retrieval-based LMs.
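<p class="mt-2"><em>A minimal sketch (not from the paper) of the two pieces the abstract refers to: the kNN-LM next-token distribution, which interpolates the parametric LM with a distribution induced by neighbors retrieved from the datastore, and a simple targeted sanitization step applied before texts enter the datastore. The SSN-shaped regular expression and the interpolation weight are illustrative assumptions.</em></p>
<pre><code>import re
import numpy as np

def sanitize(texts, private_pattern=r"\b\d{3}-\d{2}-\d{4}\b"):
    """Redact a targeted, easily detected kind of private information
    (here: strings shaped like US Social Security numbers) before the
    texts are encoded into the retrieval datastore."""
    return [re.sub(private_pattern, "[REDACTED]", t) for t in texts]

def knn_lm_next_token(p_lm, p_knn, lam=0.25):
    """kNN-LM interpolation: lam * p_kNN + (1 - lam) * p_LM."""
    return lam * np.asarray(p_knn) + (1.0 - lam) * np.asarray(p_lm)

print(sanitize(["call me at 123-45-6789", "the weather is nice"]))
print(knn_lm_next_token(p_lm=[0.7, 0.2, 0.1], p_knn=[0.1, 0.8, 0.1]))
</code></pre>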
</div>
</div>
</li>
<li class="paper-item">
<h5>Marvolo: Programmatic Data Augmentation for Deep Malware Detection</h5>
Mike Wong, Edward Raff, James Holt, Ravi Netravali<br>
ECML PKDD 2023<br>
<span class="badge badge-secondary">ML for Systems</span>
<span class="badge badge-secondary">Privacy and Security</span>
<div class="mt-2">
<a href="https://michaeldwong.github.io/papers/marvolo.pdf" target="_blank">
<button type="button" class="btn btn-outline-primary btn-sm">PDF</button>
</a>
<a class="btn btn-outline-primary btn-sm" data-toggle="collapse" href="#marvolo-abstract" role="button" aria-expanded="false" aria-controls="marvolo-abstract">Abstract</a>
</div>
<div class="collapse" id="marvolo-abstract">
<div class="card card-body">
Data acquisition for ML-driven malware detection is challenging. While large commercial datasets exist, they are prohibitively expensive. On the other hand, an entity (e.g., a bank or government) may be targeted with unique malware, but the data samples available will never be sufficient to train a bespoke ML-based detector. While data augmentation has been a key component in improving deep learning models by providing requisite diversity for generalization, it has proven far more challenging for malware detection. The main challenges are that (1) determining the augmentations to make is not straightforward, (2) operations are on binaries rather than source code (which is not available), complicating correctness and understanding, and (3) labeling new files mandates expensive binary reverse engineering. We present Marvolo for creating realistic, semantics-preserving transformations that mimic the code alterations made by malware authors in practice, allowing us to generate augmented data on raw binary files. This also enables Marvolo to safely propagate labels to newly-generated data. Across several malware datasets and recent ML-based detectors, Marvolo improves accuracy and AUC by up to 5% and 10%, respectively, while boosting efficiency by 79x by avoiding redundant computation.
</div>
</div>
</li>
<li class="paper-item">
<h5>MUX-PLMs: Data multiplexing for high-throughput language models</h5>
Vishvak Murahari, Ameet Deshpande, Carlos E Jimenez, Izhak Shafran, Mingqiu Wang, Yuan Cao, Karthik Narasimhan <br>
EMNLP 2023<br>
<span class="badge badge-secondary">Efficient Inference</span>
<div class="mt-2">
<a href="https://arxiv.org/pdf/2302.12441" target="_blank">
<button type="button" class="btn btn-outline-primary btn-sm">PDF</button>
</a>
<a class="btn btn-outline-primary btn-sm" data-toggle="collapse" href="#muxplms-abstract" role="button" aria-expanded="false" aria-controls="muxplms-abstract">Abstract</a>
<a href="https://github.com/state-spaces/mamba/" target="_blank">
<button type="button" class="btn btn-outline-primary btn-sm">Code</button>
</a>
</div>
<div class="collapse" id="muxplms-abstract">
<div class="card card-body">
The widespread adoption of large language models such as ChatGPT and Bard has led to unprecedented demand for these technologies. The burgeoning cost of inference for ever-increasing model sizes, coupled with hardware shortages, has limited affordable access and poses a pressing need for efficiency approaches geared towards high throughput and performance. Multi-input multi-output (MIMO) algorithms such as data multiplexing offer a promising solution with a many-fold increase in throughput by performing inference for multiple inputs at the cost of a single input. Yet these approaches are not currently performant enough to be deployed in modern systems. We change that by developing MUX-PLMs, a class of high-throughput pre-trained language models (PLMs) trained with data multiplexing, that can be fine-tuned for any downstream task to yield high throughput and high performance. Our novel multiplexing and demultiplexing modules proficiently entangle and disentangle inputs, and enable high-performance, high-throughput MUX-PLMs that are competitive with vanilla PLMs while achieving 2x/5x inference speedup with only a 1-4% drop on a broad suite of tasks.
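<p class="mt-2"><em>A toy stand-in (not the paper's learned modules) for the multiplex/demultiplex idea above: N inputs are entangled into a single sequence using per-input random orthogonal keys, one forward pass serves all of them, and per-input output streams are recovered with the matching keys. The key construction and all names are illustrative assumptions.</em></p>
<pre><code>import numpy as np

rng = np.random.default_rng(0)

def multiplex(inputs, keys):
    """Entangle N input embeddings into one sequence: project each input
    with its own fixed key, then average across inputs."""
    projected = np.einsum("nsd,nde->nse", inputs, keys)
    return projected.mean(axis=0)              # (seq, dim): one forward pass serves N inputs

def demultiplex(hidden, keys):
    """Recover one output stream per input by applying the transpose of
    the matching key to the shared hidden states."""
    return np.einsum("sd,ned->nse", hidden, keys)

N, seq, dim = 4, 8, 16
keys = np.stack([np.linalg.qr(rng.normal(size=(dim, dim)))[0] for _ in range(N)])
inputs = rng.normal(size=(N, seq, dim))
mux = multiplex(inputs, keys)                  # fed to the shared PLM once
streams = demultiplex(mux, keys)               # per-input output streams
print(mux.shape, streams.shape)                # (8, 16) (4, 8, 16)
</code></pre>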
</div>
</div>
</li>
<li class="paper-item">
<h5>Mamba: Linear-Time Sequence Modeling with Selective State Spaces</h5>
Albert Gu*, Tri Dao* <br>
COLM 2024<br>
<span class="badge badge-secondary">State Space Models</span>
<span class="badge badge-secondary">Emerging Paradigms</span>
<span class="badge badge-secondary">Sequence Modeling</span>
<div class="mt-2">
<a href="https://arxiv.org/pdf/2312.00752" target="_blank">
<button type="button" class="btn btn-outline-primary btn-sm">PDF</button>
</a>
<a class="btn btn-outline-primary btn-sm" data-toggle="collapse" href="#mamba-abstract" role="button" aria-expanded="false" aria-controls="mamba-abstract">Abstract</a>
<a href="https://github.com/state-spaces/mamba/" target="_blank">
<button type="button" class="btn btn-outline-primary btn-sm">Code</button>
</a>
</div>
<div class="collapse" id="mamba-abstract">
<div class="card card-body">
Foundation models, now powering most of the exciting applications in deep learning, are almost universally based on the Transformer architecture and its core attention module. Many subquadratic-time architectures such as linear attention, gated convolution and recurrent models, and structured state space models (SSMs) have been developed to address Transformers' computational inefficiency on long sequences, but they have not performed as well as attention on important modalities such as language. We identify that a key weakness of such models is their inability to perform content-based reasoning, and make several improvements. First, simply letting the SSM parameters be functions of the input addresses their weakness with discrete modalities, allowing the model to selectively propagate or forget information along the sequence length dimension depending on the current token. Second, even though this change prevents the use of efficient convolutions, we design a hardware-aware parallel algorithm in recurrent mode. We integrate these selective SSMs into a simplified end-to-end neural network architecture without attention or even MLP blocks (Mamba). Mamba enjoys fast inference (5× higher throughput than Transformers) and linear scaling in sequence length, and its performance improves on real data up to million-length sequences. As a general sequence model backbone, Mamba achieves state-of-the-art performance across several modalities such as language, audio, and genomics. On language modeling, our Mamba-3B model outperforms Transformers of the same size and matches Transformers twice its size, both in pretraining and downstream evaluation.
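<p class="mt-2"><em>A single-channel, purely sequential reference (not the paper's hardware-aware parallel scan) of the selective-SSM recurrence the abstract describes: the step size delta and the projections B and C vary per token, so the state can selectively keep or forget information. All shapes and names are illustrative assumptions.</em></p>
<pre><code>import numpy as np

def selective_scan(x, A, B, C, delta):
    """Reference recurrence of a diagonal selective SSM:
       h_t = exp(delta_t * A) * h_{t-1} + delta_t * B_t * x_t,   y_t = C_t . h_t
    x: (T,) inputs, A: (N,) diagonal state matrix,
    B, C: (T, N) input-dependent projections, delta: (T,) input-dependent step sizes.
    """
    T, N = B.shape
    h = np.zeros(N)
    y = np.empty(T)
    for t in range(T):
        a_bar = np.exp(delta[t] * A)        # input-dependent discretization of A
        h = a_bar * h + delta[t] * B[t] * x[t]
        y[t] = C[t] @ h
    return y

rng = np.random.default_rng(0)
T, N = 10, 4
A = -np.abs(rng.normal(size=N))             # negative entries keep the state stable
print(selective_scan(rng.normal(size=T), A,
                     rng.normal(size=(T, N)), rng.normal(size=(T, N)),
                     np.abs(rng.normal(size=T))))
</code></pre>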
</div>
</div>
</li>
<li class="paper-item">
<h5>Shockwave: Fair and Efficient Cluster Scheduling for Dynamic Adaptation in Machine Learning</h5>
Pengfei Zheng, Rui Pan, Tarannum Khan, Shivaram Venkataraman, Aditya Akella <br>
NSDI 2023<br>
<span class="badge badge-secondary">Efficient Training</span>
<div class="mt-2">
<a href="https://www.usenix.org/system/files/nsdi23-zheng.pdf" target="_blank">
<button type="button" class="btn btn-outline-primary btn-sm">PDF</button>
</a>
<a class="btn btn-outline-primary btn-sm" data-toggle="collapse" href="#shockwave-abstract" role="button" aria-expanded="false" aria-controls="shockwave-abstract">Abstract</a>
<a href="https://github.com/uw-mad-dash/shockwave" target="_blank">
<button type="button" class="btn btn-outline-primary btn-sm">Code</button>
</a>
</div>
<div class="collapse" id="shockwave-abstract">
<div class="card card-body">
Dynamic adaptation has become an essential technique in accelerating distributed machine learning (ML) training: Recent studies have shown that dynamically adjusting model structure (e.g., lottery ticket hypothesis) or hyperparameters (e.g., batch size) can significantly accelerate training without sacrificing accuracy. However, existing ML cluster schedulers are not designed to handle dynamic adaptation.