@inproceedings{deiseroth-etal-2024-divergent,
title = "Divergent Token Metrics: Measuring degradation to prune away {LLM} components {--} and optimize quantization",
author = {Deiseroth, Bj{\"o}rn and
Meuer, Max and
Gritsch, Nikolas and
Eichenberg, Constantin and
Schramowski, Patrick and
A{\ss}enmacher, Matthias and
Kersting, Kristian},
editor = "Duh, Kevin and
Gomez, Helena and
Bethard, Steven",
booktitle = "Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)",
month = jun,
year = "2024",
address = "Mexico City, Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.naacl-long.377",
doi = "10.18653/v1/2024.naacl-long.377",
pages = "6764--6783",
abstract = "Large Language Models (LLMs) have reshaped natural language processing with their impressive capabilities. However, their ever-increasing size has raised concerns about their effective deployment and the need for LLM compression. This study introduces the Divergent Token Metrics (DTMs), a novel approach to assessing compressed LLMs, addressing the limitations of traditional perplexity or accuracy measures that fail to accurately reflect text generation quality. DTMs measure token divergences that allow deeper insights into the subtleties of model compression, in particular, when evaluating components{'} impacts individually. Utilizing the First Divergent Token Metric (FDTM) in model sparsification reveals that 25{\%} of all attention components can be pruned beyond 90{\%} on the Llama-2 model family, still keeping SOTA performance. For quantization, FDTM suggests that more than 80{\%} of parameters can be naively transformed to int8 without special outlier management. These evaluations indicate the necessity of choosing appropriate compressions for parameters individually{---}and that FDTM can identify those{---}while standard metrics result in deteriorated outcomes.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="deiseroth-etal-2024-divergent">
<titleInfo>
<title>Divergent Token Metrics: Measuring degradation to prune away LLM components – and optimize quantization</title>
</titleInfo>
<name type="personal">
<namePart type="given">Björn</namePart>
<namePart type="family">Deiseroth</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Max</namePart>
<namePart type="family">Meuer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nikolas</namePart>
<namePart type="family">Gritsch</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Constantin</namePart>
<namePart type="family">Eichenberg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Patrick</namePart>
<namePart type="family">Schramowski</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Matthias</namePart>
<namePart type="family">Aßenmacher</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kristian</namePart>
<namePart type="family">Kersting</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kevin</namePart>
<namePart type="family">Duh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Helena</namePart>
<namePart type="family">Gomez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Steven</namePart>
<namePart type="family">Bethard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Mexico City, Mexico</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Large Language Models (LLMs) have reshaped natural language processing with their impressive capabilities. However, their ever-increasing size has raised concerns about their effective deployment and the need for LLM compression. This study introduces the Divergent Token Metrics (DTMs), a novel approach to assessing compressed LLMs, addressing the limitations of traditional perplexity or accuracy measures that fail to accurately reflect text generation quality. DTMs measure token divergences that allow deeper insights into the subtleties of model compression, in particular, when evaluating components’ impacts individually. Utilizing the First Divergent Token Metric (FDTM) in model sparsification reveals that 25% of all attention components can be pruned beyond 90% on the Llama-2 model family, still keeping SOTA performance. For quantization, FDTM suggests that more than 80% of parameters can be naively transformed to int8 without special outlier management. These evaluations indicate the necessity of choosing appropriate compressions for parameters individually—and that FDTM can identify those—while standard metrics result in deteriorated outcomes.</abstract>
<identifier type="citekey">deiseroth-etal-2024-divergent</identifier>
<identifier type="doi">10.18653/v1/2024.naacl-long.377</identifier>
<location>
<url>https://aclanthology.org/2024.naacl-long.377</url>
</location>
<part>
<date>2024-06</date>
<extent unit="page">
<start>6764</start>
<end>6783</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Divergent Token Metrics: Measuring degradation to prune away LLM components – and optimize quantization
%A Deiseroth, Björn
%A Meuer, Max
%A Gritsch, Nikolas
%A Eichenberg, Constantin
%A Schramowski, Patrick
%A Aßenmacher, Matthias
%A Kersting, Kristian
%Y Duh, Kevin
%Y Gomez, Helena
%Y Bethard, Steven
%S Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)
%D 2024
%8 June
%I Association for Computational Linguistics
%C Mexico City, Mexico
%F deiseroth-etal-2024-divergent
%X Large Language Models (LLMs) have reshaped natural language processing with their impressive capabilities. However, their ever-increasing size has raised concerns about their effective deployment and the need for LLM compression. This study introduces the Divergent Token Metrics (DTMs), a novel approach to assessing compressed LLMs, addressing the limitations of traditional perplexity or accuracy measures that fail to accurately reflect text generation quality. DTMs measure token divergences that allow deeper insights into the subtleties of model compression, in particular, when evaluating components’ impacts individually. Utilizing the First Divergent Token Metric (FDTM) in model sparsification reveals that 25% of all attention components can be pruned beyond 90% on the Llama-2 model family, still keeping SOTA performance. For quantization, FDTM suggests that more than 80% of parameters can be naively transformed to int8 without special outlier management. These evaluations indicate the necessity of choosing appropriate compressions for parameters individually—and that FDTM can identify those—while standard metrics result in deteriorated outcomes.
%R 10.18653/v1/2024.naacl-long.377
%U https://aclanthology.org/2024.naacl-long.377
%U https://doi.org/10.18653/v1/2024.naacl-long.377
%P 6764-6783
Markdown (Informal)
[Divergent Token Metrics: Measuring degradation to prune away LLM components – and optimize quantization](https://aclanthology.org/2024.naacl-long.377) (Deiseroth et al., NAACL 2024)
ACL
Björn Deiseroth, Max Meuer, Nikolas Gritsch, Constantin Eichenberg, Patrick Schramowski, Matthias Aßenmacher, and Kristian Kersting. 2024. Divergent Token Metrics: Measuring degradation to prune away LLM components – and optimize quantization. In Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers), pages 6764–6783, Mexico City, Mexico. Association for Computational Linguistics.
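
The abstract above describes the First Divergent Token Metric (FDTM) as a way to score a compressed model by how quickly its generations drift from the original model's. As a rough illustration only (not the authors' reference implementation), the sketch below interprets FDTM as the index of the first position, under teacher forcing on a shared token sequence, where the compressed model's greedy next-token prediction differs from the reference model's. The function names and the callable-based interface are invented for this example, and the paper's exact definition and aggregation over a corpus may differ.

```python
# Hypothetical sketch of a "first divergent token" comparison between an
# original model and a compressed copy. Interfaces and names are illustrative
# only; they are not taken from the paper's code.
from typing import Callable, List, Sequence

Logits = Sequence[float]                      # next-token scores over the vocabulary
NextTokenFn = Callable[[List[int]], Logits]   # maps a token prefix to next-token logits


def _argmax(scores: Logits) -> int:
    """Index of the highest-scoring token (greedy choice)."""
    best_idx, best_val = 0, float("-inf")
    for idx, val in enumerate(scores):
        if val > best_val:
            best_idx, best_val = idx, val
    return best_idx


def first_divergent_token(reference: NextTokenFn,
                          compressed: NextTokenFn,
                          token_ids: List[int]) -> int:
    """Score a compressed model against the reference on one token sequence.

    For each prefix token_ids[:t] (teacher forcing), both models greedily
    predict the token at position t. The return value is the first position
    where those predictions disagree, or len(token_ids) if they never do, so
    larger values mean the compressed model tracks the original for longer.
    """
    for t in range(1, len(token_ids)):
        prefix = token_ids[:t]
        if _argmax(reference(prefix)) != _argmax(compressed(prefix)):
            return t
    return len(token_ids)


if __name__ == "__main__":
    # Toy models over a 4-token vocabulary: the "compressed" model flips its
    # preference once the prefix grows past two tokens, so divergence is at t=3.
    ref = lambda prefix: [0.1, 0.9, 0.0, 0.0]
    comp = lambda prefix: [0.1, 0.9, 0.0, 0.0] if len(prefix) < 3 else [0.9, 0.1, 0.0, 0.0]
    print(first_divergent_token(ref, comp, [2, 3, 1, 0, 2]))  # -> 3
```

Per the abstract, such per-component divergence scores are what let the paper rank individual attention components for pruning and decide which parameters tolerate naive int8 quantization; the sketch only demonstrates the token-level divergence idea on a single sequence.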