@inproceedings{potamitis-etal-2025-cache,
title = "Cache Saver: A Modular Framework for Efficient, Affordable, and Reproducible {LLM} Inference",
author = "Potamitis, Nearchos and
Klein, Lars Henning and
Mohammadi, Bardia and
Xu, Chongyang and
Mukherjee, Attreyee and
Tandon, Niket and
Bindschaedler, Laurent and
Arora, Akhil",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2025",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-emnlp.1402/",
doi = "10.18653/v1/2025.findings-emnlp.1402",
pages = "25703--25724",
ISBN = "979-8-89176-335-7",
abstract = "Inference constitutes the majority of costs throughout the lifecycle of a large language model (LLM). While numerous LLM inference engines focusing primarily on low-level optimizations have been developed, there is a scarcity of non-intrusive client-side frameworks that perform high-level optimizations. In this paper, we introduce Cache Saver, a modular, plug-and-play, and asynchronous framework that facilitates high-level inference optimizations, thereby integrating cleanly into existing systems without requiring changes to the end-user application logic or the underlying LLM. The key novelty is a *namespace-aware list-valued cache* that ensures *statistical integrity* of LLM responses by generating *i.i.d.* responses within a namespace as well as ensuring *reproducibility*. Moreover, as a direct consequence of operating at a high level, Cache Saver supports both local and online models. We conduct extensive experiments with five representative state-of-the-art reasoning strategies, five diverse benchmark tasks, and three different LLMs. On average across all methods, tasks, and LLMs, Cache Saver reduces cost by $\simeq 25\%$ and CO$_2$ by $\simeq 35\%$. Notably, Cache Saver excels in practical machine learning scenarios such as benchmarking across multiple methods or conducting ablation analysis of a specific method, obtaining substantial cost and carbon footprint reduction of $\simeq 60\%$. Cache Saver is publicly available at [https://github.com/au-clan/cachesaver](https://github.com/au-clan/cachesaver)."
}
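The abstract above describes the core mechanism as a *namespace-aware list-valued cache*: responses are reused across namespaces to reduce cost, while within a namespace each request is served a fresh, previously unused entry, preserving i.i.d. sampling and reproducibility. The sketch below is illustrative only; it is not Cache Saver's actual API, and every class, function, and variable name in it is hypothetical.

```python
# Illustrative sketch of a namespace-aware, list-valued cache in the spirit of
# the abstract above. Not Cache Saver's API; all names here are hypothetical.
from collections import defaultdict
from typing import Callable, Dict, List, Tuple


class NamespaceListCache:
    """Caches a list of responses per prompt; each namespace consumes fresh entries.

    Responses are reused *across* namespaces (saving cost and CO2) but never
    repeated *within* a namespace, so draws inside one namespace stay i.i.d.
    """

    def __init__(self, generate: Callable[[str], str]):
        self._generate = generate  # e.g. a (costly) call to an LLM
        self._lists: Dict[str, List[str]] = defaultdict(list)          # prompt -> responses
        self._used: Dict[Tuple[str, str], int] = defaultdict(int)      # (namespace, prompt) -> count

    def sample(self, namespace: str, prompt: str) -> str:
        idx = self._used[(namespace, prompt)]
        responses = self._lists[prompt]
        if idx >= len(responses):
            # This namespace has exhausted the cached list: generate a new response.
            responses.append(self._generate(prompt))
        self._used[(namespace, prompt)] += 1
        return responses[idx]


if __name__ == "__main__":
    import random

    def fake_llm(prompt: str) -> str:
        # Stand-in for a real model call.
        return f"{prompt} -> {random.random():.4f}"

    cache = NamespaceListCache(fake_llm)
    # Two "runs" (namespaces) of the same experiment: run B reuses run A's responses.
    a = [cache.sample("run-A", "solve 2+2") for _ in range(3)]
    b = [cache.sample("run-B", "solve 2+2") for _ in range(3)]
    assert a == b            # reuse across namespaces -> reproducible, cheaper
    assert len(set(a)) == 3  # no repeats within a namespace -> i.i.d. draws preserved
```

Under these assumptions, repeating an experiment under a new namespace replays the cached responses (reproducibility at zero extra cost), while additional draws inside a single namespace always trigger fresh generations rather than returning stale cache hits.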