@inproceedings{sk-etal-2025-llm,
title = "{LLM} Compression: How Far Can We Go in Balancing Size and Performance?",
author = "Sk, Sahil and
Dhal, Debashish and
Khosla, Sonal and
Dhaka, Akash and
Parida, Shantipriya and
Shahid, Sk and
Shekhar, Sambit and
Prasad, Dilip and
Bojar, Ondrej",
editor = "Angelova, Galia and
Kunilovskaya, Maria and
Escribe, Marie and
Mitkov, Ruslan",
booktitle = "Proceedings of the 15th International Conference on Recent Advances in Natural Language Processing - Natural Language Processing in the Generative AI Era",
month = sep,
year = "2025",
address = "Varna, Bulgaria",
publisher = "INCOMA Ltd., Shoumen, Bulgaria",
url = "https://aclanthology.org/2025.ranlp-1.136/",
pages = "1183--1187",
abstract = "Quantization is an essential and popular technique for improving the accessibility of large language models (LLMs) by reducing memory usage and computational costs while maintaining performance. In this study, we apply 4-bit Group Scaling Quantization (GSQ) and Generative Pretrained Transformer Quantization (GPTQ) to LLaMA 1B, Qwen 0.5B, and PHI 1.5B, evaluating their impact across multiple NLP tasks. We benchmark these models on MS MARCO (Information Retrieval), BoolQ (Boolean Question Answering), and GSM8K (Mathematical Reasoning) datasets, assessing both accuracy and efficiency accross various tasks. The study measures the trade-offs between model compression and task performance, analyzing key evaluation metrics namely: accuracy, inference latency, and throughput, providing insights into the suitability of low-bit quantization for real-world deployment and highlight the tradeoffs between memory, computing and latency in such settings, helping a user make suitable decisions"
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="sk-etal-2025-llm">
<titleInfo>
<title>LLM Compression: How Far Can We Go in Balancing Size and Performance?</title>
</titleInfo>
<name type="personal">
<namePart type="given">Sahil</namePart>
<namePart type="family">Sk</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Debashish</namePart>
<namePart type="family">Dhal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sonal</namePart>
<namePart type="family">Khosla</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Akash</namePart>
<namePart type="family">Dhaka</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shantipriya</namePart>
<namePart type="family">Parida</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sk</namePart>
<namePart type="family">Shahid</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sambit</namePart>
<namePart type="family">Shekhar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dilip</namePart>
<namePart type="family">Prasad</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ondrej</namePart>
<namePart type="family">Bojar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-09</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 15th International Conference on Recent Advances in Natural Language Processing - Natural Language Processing in the Generative AI Era</title>
</titleInfo>
<name type="personal">
<namePart type="given">Galia</namePart>
<namePart type="family">Angelova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Kunilovskaya</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marie</namePart>
<namePart type="family">Escribe</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ruslan</namePart>
<namePart type="family">Mitkov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>INCOMA Ltd., Shoumen, Bulgaria</publisher>
<place>
<placeTerm type="text">Varna, Bulgaria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Quantization is an essential and popular technique for improving the accessibility of large language models (LLMs) by reducing memory usage and computational costs while maintaining performance. In this study, we apply 4-bit Group Scaling Quantization (GSQ) and Generative Pretrained Transformer Quantization (GPTQ) to LLaMA 1B, Qwen 0.5B, and PHI 1.5B, evaluating their impact across multiple NLP tasks. We benchmark these models on the MS MARCO (Information Retrieval), BoolQ (Boolean Question Answering), and GSM8K (Mathematical Reasoning) datasets, assessing both accuracy and efficiency across tasks. The study measures the trade-offs between model compression and task performance, analyzing key evaluation metrics, namely accuracy, inference latency, and throughput. It provides insights into the suitability of low-bit quantization for real-world deployment, highlighting the trade-offs between memory, compute, and latency in such settings and helping users make informed deployment decisions.</abstract>
<identifier type="citekey">sk-etal-2025-llm</identifier>
<location>
<url>https://aclanthology.org/2025.ranlp-1.136/</url>
</location>
<part>
<date>2025-09</date>
<extent unit="page">
<start>1183</start>
<end>1187</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T LLM Compression: How Far Can We Go in Balancing Size and Performance?
%A Sk, Sahil
%A Dhal, Debashish
%A Khosla, Sonal
%A Dhaka, Akash
%A Parida, Shantipriya
%A Shahid, Sk
%A Shekhar, Sambit
%A Prasad, Dilip
%A Bojar, Ondrej
%Y Angelova, Galia
%Y Kunilovskaya, Maria
%Y Escribe, Marie
%Y Mitkov, Ruslan
%S Proceedings of the 15th International Conference on Recent Advances in Natural Language Processing - Natural Language Processing in the Generative AI Era
%D 2025
%8 September
%I INCOMA Ltd., Shoumen, Bulgaria
%C Varna, Bulgaria
%F sk-etal-2025-llm
%X Quantization is an essential and popular technique for improving the accessibility of large language models (LLMs) by reducing memory usage and computational costs while maintaining performance. In this study, we apply 4-bit Group Scaling Quantization (GSQ) and Generative Pretrained Transformer Quantization (GPTQ) to LLaMA 1B, Qwen 0.5B, and PHI 1.5B, evaluating their impact across multiple NLP tasks. We benchmark these models on the MS MARCO (Information Retrieval), BoolQ (Boolean Question Answering), and GSM8K (Mathematical Reasoning) datasets, assessing both accuracy and efficiency across tasks. The study measures the trade-offs between model compression and task performance, analyzing key evaluation metrics, namely accuracy, inference latency, and throughput. It provides insights into the suitability of low-bit quantization for real-world deployment, highlighting the trade-offs between memory, compute, and latency in such settings and helping users make informed deployment decisions.
%U https://aclanthology.org/2025.ranlp-1.136/
%P 1183-1187
Markdown (Informal)
[LLM Compression: How Far Can We Go in Balancing Size and Performance?](https://aclanthology.org/2025.ranlp-1.136/) (Sk et al., RANLP 2025)
ACL
- Sahil Sk, Debashish Dhal, Sonal Khosla, Akash Dhaka, Shantipriya Parida, Sk Shahid, Sambit Shekhar, Dilip Prasad, and Ondrej Bojar. 2025. LLM Compression: How Far Can We Go in Balancing Size and Performance? In Proceedings of the 15th International Conference on Recent Advances in Natural Language Processing - Natural Language Processing in the Generative AI Era, pages 1183–1187, Varna, Bulgaria. INCOMA Ltd., Shoumen, Bulgaria.
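
For readers who want to try a comparable setup, below is a minimal sketch (not taken from the paper) of 4-bit GPTQ quantization of a small causal language model via the Hugging Face `transformers` GPTQ integration. The model identifier, calibration dataset, and test prompt are illustrative assumptions, not the paper's exact configuration.

```python
# Minimal sketch of 4-bit GPTQ quantization with Hugging Face transformers.
# Requires: pip install transformers optimum auto-gptq accelerate
# The model ID and calibration dataset are assumptions for illustration.
from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig

model_id = "Qwen/Qwen2.5-0.5B"  # assumed stand-in for the paper's Qwen 0.5B

tokenizer = AutoTokenizer.from_pretrained(model_id)

# GPTQ calibrates the 4-bit weights against a small text corpus ("c4" here).
gptq_config = GPTQConfig(bits=4, dataset="c4", tokenizer=tokenizer)

# Passing a quantization_config triggers GPTQ quantization at load time.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=gptq_config,
)

# Smoke test; accuracy, latency, and throughput of the quantized model
# can then be benchmarked on tasks such as BoolQ or GSM8K.
inputs = tokenizer("Question: Is the sky blue? Answer:", return_tensors="pt")
inputs = inputs.to(model.device)
outputs = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```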