@inproceedings{namburi-etal-2023-cost,
title = "The Cost of Compression: Investigating the Impact of Compression on Parametric Knowledge in Language Models",
author = "Namburi, Satya Sai Srinath and
Sreedhar, Makesh and
Srinivasan, Srinath and
Sala, Frederic",
editor = "Bouamor, Houda and
Pino, Juan and
Bali, Kalika",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2023",
month = dec,
year = "2023",
address = "Singapore",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.findings-emnlp.349",
doi = "10.18653/v1/2023.findings-emnlp.349",
pages = "5255--5273",
abstract = "Compressing large language models (LLMs), often consisting of billions of parameters, provides faster inference, smaller memory footprints, and enables local deployment. The standard compression techniques are pruning and quantization, with the former eliminating redundant connections in model layers and the latter representing model parameters with as little as 4 bits. The key tradeoff is between the degree of compression and the impact on the quality of the compressed model. Existing research on LLM compression primarily focuses on performance in terms of general metrics like perplexity or downstream task accuracy. More fine-grained metrics, such as those measuring parametric knowledge, remain significantly underexplored. To help bridge this gap, we present a comprehensive analysis across multiple model families using the LAMA and LM-Harness benchmarks in order to systematically quantify the effect of commonly employed compression techniques on model performance. A particular focus is on tradeoffs involving parametric knowledge, with the goal of providing practitioners with practical insights to make informed decisions on compression.",
}
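The abstract describes pruning as eliminating redundant connections in model layers. As a companion to the record above, here is a minimal sketch of unstructured magnitude pruning, assuming PyTorch; the toy weight tensor and the 50% sparsity level are illustrative assumptions, not the paper's experimental setup.

import torch

def magnitude_prune(weight: torch.Tensor, sparsity: float) -> torch.Tensor:
    # Zero the `sparsity` fraction of entries with the smallest magnitude,
    # the classic unstructured magnitude-pruning criterion.
    threshold = torch.quantile(weight.abs().flatten(), sparsity)
    return weight * (weight.abs() > threshold)

torch.manual_seed(0)
w = torch.randn(4, 4)
pruned = magnitude_prune(w, sparsity=0.5)
print(pruned)
print("remaining nonzeros:", int((pruned != 0).sum()))

A real pipeline would apply this layer by layer to a pretrained LLM and often fine-tune afterwards; the sketch shows only the thresholding step.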
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="namburi-etal-2023-cost">
    <titleInfo>
      <title>The Cost of Compression: Investigating the Impact of Compression on Parametric Knowledge in Language Models</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Satya</namePart>
      <namePart type="given">Sai</namePart>
      <namePart type="given">Srinath</namePart>
      <namePart type="family">Namburi</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Makesh</namePart>
      <namePart type="family">Sreedhar</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Srinath</namePart>
      <namePart type="family">Srinivasan</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Frederic</namePart>
      <namePart type="family">Sala</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2023-12</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: EMNLP 2023</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Houda</namePart>
        <namePart type="family">Bouamor</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Juan</namePart>
        <namePart type="family">Pino</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Kalika</namePart>
        <namePart type="family">Bali</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Singapore</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Compressing large language models (LLMs), often consisting of billions of parameters, provides faster inference, smaller memory footprints, and enables local deployment. The standard compression techniques are pruning and quantization, with the former eliminating redundant connections in model layers and the latter representing model parameters with as little as 4 bits. The key tradeoff is between the degree of compression and the impact on the quality of the compressed model. Existing research on LLM compression primarily focuses on performance in terms of general metrics like perplexity or downstream task accuracy. More fine-grained metrics, such as those measuring parametric knowledge, remain significantly underexplored. To help bridge this gap, we present a comprehensive analysis across multiple model families using the LAMA and LM-Harness benchmarks in order to systematically quantify the effect of commonly employed compression techniques on model performance. A particular focus is on tradeoffs involving parametric knowledge, with the goal of providing practitioners with practical insights to make informed decisions on compression.</abstract>
    <identifier type="citekey">namburi-etal-2023-cost</identifier>
    <identifier type="doi">10.18653/v1/2023.findings-emnlp.349</identifier>
    <location>
      <url>https://aclanthology.org/2023.findings-emnlp.349</url>
    </location>
    <part>
      <date>2023-12</date>
      <extent unit="page">
        <start>5255</start>
        <end>5273</end>
      </extent>
    </part>
  </mods>
</modsCollection>
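The abstract also notes that quantization can represent model parameters with as little as 4 bits. Below is a minimal sketch of symmetric per-tensor round-to-nearest uniform quantization, assuming PyTorch; the bit width and scaling choice are illustrative assumptions, not the specific quantization scheme evaluated in the paper.

import torch

def uniform_quantize(weight: torch.Tensor, bits: int = 4) -> torch.Tensor:
    # Map weights onto 2**bits integer levels, then dequantize back to
    # floats so the rounding error can be inspected directly.
    levels = 2 ** (bits - 1) - 1            # e.g. 7 for signed 4-bit
    scale = weight.abs().max() / levels     # largest magnitude hits the top level
    q = torch.clamp(torch.round(weight / scale), -levels - 1, levels)
    return q * scale

torch.manual_seed(0)
w = torch.randn(4, 4)
w4 = uniform_quantize(w, bits=4)
print("max abs rounding error:", (w - w4).abs().max().item())

Production 4-bit schemes typically use per-group scales and calibration rather than a single per-tensor scale; this sketch keeps only the core round-to-nearest step.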
%0 Conference Proceedings
%T The Cost of Compression: Investigating the Impact of Compression on Parametric Knowledge in Language Models
%A Namburi, Satya Sai Srinath
%A Sreedhar, Makesh
%A Srinivasan, Srinath
%A Sala, Frederic
%Y Bouamor, Houda
%Y Pino, Juan
%Y Bali, Kalika
%S Findings of the Association for Computational Linguistics: EMNLP 2023
%D 2023
%8 December
%I Association for Computational Linguistics
%C Singapore
%F namburi-etal-2023-cost
%X Compressing large language models (LLMs), often consisting of billions of parameters, provides faster inference, smaller memory footprints, and enables local deployment. The standard compression techniques are pruning and quantization, with the former eliminating redundant connections in model layers and the latter representing model parameters with as little as 4 bits. The key tradeoff is between the degree of compression and the impact on the quality of the compressed model. Existing research on LLM compression primarily focuses on performance in terms of general metrics like perplexity or downstream task accuracy. More fine-grained metrics, such as those measuring parametric knowledge, remain significantly underexplored. To help bridge this gap, we present a comprehensive analysis across multiple model families using the LAMA and LM-Harness benchmarks in order to systematically quantify the effect of commonly employed compression techniques on model performance. A particular focus is on tradeoffs involving parametric knowledge, with the goal of providing practitioners with practical insights to make informed decisions on compression.
%R 10.18653/v1/2023.findings-emnlp.349
%U https://aclanthology.org/2023.findings-emnlp.349
%U https://doi.org/10.18653/v1/2023.findings-emnlp.349
%P 5255-5273
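The paper quantifies the effect of compression on parametric knowledge using the LAMA and LM-Harness benchmarks. The toy experiment below is a hypothetical stand-in, not the paper's setup: it stores subject-to-object facts in a small model's parameters, then compares exact-match recall (a cloze-style probe) before and after quantizing the weights. The fact relation, model size, and bit widths are all illustrative assumptions.

import torch

torch.manual_seed(0)
n_entities, dim = 100, 16
# Hypothetical relation: each subject entity maps to one object entity.
facts = {s: (s * 7 + 3) % n_entities for s in range(n_entities)}

emb = torch.nn.Embedding(n_entities, dim)   # parameters that "store" the facts
head = torch.nn.Linear(dim, n_entities)
opt = torch.optim.Adam(list(emb.parameters()) + list(head.parameters()), lr=0.05)
subjects = torch.arange(n_entities)
objects = torch.tensor([facts[s] for s in range(n_entities)])

# Train until the facts are memorized in the parameters.
for _ in range(500):
    loss = torch.nn.functional.cross_entropy(head(emb(subjects)), objects)
    opt.zero_grad()
    loss.backward()
    opt.step()

def uniform_quantize(w: torch.Tensor, bits: int) -> torch.Tensor:
    levels = 2 ** (bits - 1) - 1
    scale = w.abs().max() / levels
    return torch.clamp(torch.round(w / scale), -levels - 1, levels) * scale

def recall(weight: torch.Tensor) -> float:
    # Exact-match accuracy of predicted objects under the given embeddings.
    logits = head(torch.nn.functional.embedding(subjects, weight))
    return (logits.argmax(dim=-1) == objects).float().mean().item()

with torch.no_grad():
    print("fp32 recall:", recall(emb.weight))
    for bits in (8, 4, 2):
        print(f"{bits}-bit recall:", recall(uniform_quantize(emb.weight, bits)))

The pattern mirrors the paper's measurement idea at toy scale: knowledge held in the parameters degrades as the bit width drops, and the recall metric makes that degradation visible where perplexity alone might not.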
Markdown (Informal)
[The Cost of Compression: Investigating the Impact of Compression on Parametric Knowledge in Language Models](https://aclanthology.org/2023.findings-emnlp.349) (Namburi et al., Findings 2023)
ACL
Satya Sai Srinath Namburi, Makesh Sreedhar, Srinath Srinivasan, and Frederic Sala. 2023. The Cost of Compression: Investigating the Impact of Compression on Parametric Knowledge in Language Models. In Findings of the Association for Computational Linguistics: EMNLP 2023, pages 5255–5273, Singapore. Association for Computational Linguistics.