@inproceedings{siddiqui-etal-2025-llms,
title = "{LLM}s on a Budget? Say {HOLA}",
author = "Siddiqui, Zohaib Hasan and
Gao, Jiechao and
Shabbir, Ebad and
Azeez, Mohammad Anas and
Ali, Rafiq and
Kashyap, Gautam Siddharth and
Naseem, Usman",
editor = "Potdar, Saloni and
Rojas-Barahona, Lina and
Montella, Sebastien",
booktitle = "Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing: Industry Track",
month = nov,
year = "2025",
address = "Suzhou (China)",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.emnlp-industry.71/",
pages = "1035--1043",
ISBN = "979-8-89176-333-3",
abstract = "Running Large Language Models (LLMs) on edge devices is constrained by high compute and memory demands{---}posing a barrier for real-time applications in industries like healthcare, education, and embedded systems. Current solutions such as quantization, pruning, and Retrieval-Augmented Generation (RAG) offer only partial optimizations and often compromise on speed or accuracy. We introduce HOLA, an end-to-end optimization framework for efficient LLM deployment. Internally, it leverages Hierarchical Speculative Decoding (HSD) for faster inference without quality loss. Externally, AdaComp-RAG adjusts retrieval complexity based on context needs. Together with Lo-Bi, which blends structured pruning (LoRA) and quantization, HOLA delivers significant gains: +17.6{\%} EMA on GSM8K, +10.5{\%} MCA on ARC, and reduced latency and memory on edge devices like Jetson Nano{---}proving both scalable and production-ready. Our code is available at: https://github.com/zohaibhasan066/HOLA{\_}Codebase"
}

<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="siddiqui-etal-2025-llms">
    <titleInfo>
      <title>LLMs on a Budget? Say HOLA</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Zohaib</namePart>
      <namePart type="given">Hasan</namePart>
      <namePart type="family">Siddiqui</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Jiechao</namePart>
      <namePart type="family">Gao</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Ebad</namePart>
      <namePart type="family">Shabbir</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Mohammad</namePart>
      <namePart type="given">Anas</namePart>
      <namePart type="family">Azeez</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Rafiq</namePart>
      <namePart type="family">Ali</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Gautam</namePart>
      <namePart type="given">Siddharth</namePart>
      <namePart type="family">Kashyap</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Usman</namePart>
      <namePart type="family">Naseem</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2025-11</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing: Industry Track</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Saloni</namePart>
        <namePart type="family">Potdar</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Lina</namePart>
        <namePart type="family">Rojas-Barahona</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Sebastien</namePart>
        <namePart type="family">Montella</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Suzhou (China)</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-333-3</identifier>
    </relatedItem>
    <abstract>Running Large Language Models (LLMs) on edge devices is constrained by high compute and memory demands—posing a barrier for real-time applications in industries like healthcare, education, and embedded systems. Current solutions such as quantization, pruning, and Retrieval-Augmented Generation (RAG) offer only partial optimizations and often compromise on speed or accuracy. We introduce HOLA, an end-to-end optimization framework for efficient LLM deployment. Internally, it leverages Hierarchical Speculative Decoding (HSD) for faster inference without quality loss. Externally, AdaComp-RAG adjusts retrieval complexity based on context needs. Together with Lo-Bi, which blends structured pruning (LoRA) and quantization, HOLA delivers significant gains: +17.6% EMA on GSM8K, +10.5% MCA on ARC, and reduced latency and memory on edge devices like Jetson Nano—proving both scalable and production-ready. Our code is available at: https://github.com/zohaibhasan066/HOLA_Codebase</abstract>
    <identifier type="citekey">siddiqui-etal-2025-llms</identifier>
    <location>
      <url>https://aclanthology.org/2025.emnlp-industry.71/</url>
    </location>
    <part>
      <date>2025-11</date>
      <extent unit="page">
        <start>1035</start>
        <end>1043</end>
      </extent>
    </part>
  </mods>
</modsCollection>

%0 Conference Proceedings
%T LLMs on a Budget? Say HOLA
%A Siddiqui, Zohaib Hasan
%A Gao, Jiechao
%A Shabbir, Ebad
%A Azeez, Mohammad Anas
%A Ali, Rafiq
%A Kashyap, Gautam Siddharth
%A Naseem, Usman
%Y Potdar, Saloni
%Y Rojas-Barahona, Lina
%Y Montella, Sebastien
%S Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing: Industry Track
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou (China)
%@ 979-8-89176-333-3
%F siddiqui-etal-2025-llms
%X Running Large Language Models (LLMs) on edge devices is constrained by high compute and memory demands—posing a barrier for real-time applications in industries like healthcare, education, and embedded systems. Current solutions such as quantization, pruning, and Retrieval-Augmented Generation (RAG) offer only partial optimizations and often compromise on speed or accuracy. We introduce HOLA, an end-to-end optimization framework for efficient LLM deployment. Internally, it leverages Hierarchical Speculative Decoding (HSD) for faster inference without quality loss. Externally, AdaComp-RAG adjusts retrieval complexity based on context needs. Together with Lo-Bi, which blends structured pruning (LoRA) and quantization, HOLA delivers significant gains: +17.6% EMA on GSM8K, +10.5% MCA on ARC, and reduced latency and memory on edge devices like Jetson Nano—proving both scalable and production-ready. Our code is available at: https://github.com/zohaibhasan066/HOLA_Codebase
%U https://aclanthology.org/2025.emnlp-industry.71/
%P 1035-1043

Markdown (Informal)
[LLMs on a Budget? Say HOLA](https://aclanthology.org/2025.emnlp-industry.71/) (Siddiqui et al., EMNLP 2025)

ACL
Zohaib Hasan Siddiqui, Jiechao Gao, Ebad Shabbir, Mohammad Anas Azeez, Rafiq Ali, Gautam Siddharth Kashyap, and Usman Naseem. 2025. LLMs on a Budget? Say HOLA. In Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing: Industry Track, pages 1035–1043, Suzhou (China). Association for Computational Linguistics.