@inproceedings{haddadi-teahan-2026-comparing,
title = "Comparing Text Compression Capabilities of Large Language Models with Traditional Compression Algorithms",
author = "Haddadi, Mehran and
Teahan, William John",
editor = "Baez Santamaria, Selene and
Somayajula, Sai Ashish and
Yamaguchi, Atsuki",
booktitle = "Proceedings of the 19th Conference of the {E}uropean Chapter of the {A}ssociation for {C}omputational {L}inguistics (Volume 4: Student Research Workshop)",
month = mar,
year = "2026",
address = "Rabat, Morocco",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.eacl-srw.16/",
pages = "219--232",
ISBN = "979-8-89176-383-8",
abstract = "This work evaluates the non-English and unstructured text compression performance of Large Language Models (LLMs) by comparing them with traditional baselines on datasets from eight most widely spoken languages. Experimental results show that the evaluated LLM (LLaMA-3.2-1B) was considerably outperformed by the baselines, particularly on non-English datasets, where its performance relative to the best baseline was more than three times worse than on English datasets on average. It also compressed unstructured English data up to more than twofold less effectively than plain English data. Traditional methods, however, remained largely dataset-agnostic. Surprisingly, the LLM achieved worse compression ratios on some datasets than others despite modeling them more accurately. Overall, the outcomes and substantially higher compression time and resource consumption indicate that current LLMs are highly impractical for the compression task, where traditional methods continue to excel. Codes are available at: https://github.com/mehranhaddadi13/llm{\_}compress."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="haddadi-teahan-2026-comparing">
<titleInfo>
<title>Comparing Text Compression Capabilities of Large Language Models with Traditional Compression Algorithms</title>
</titleInfo>
<name type="personal">
<namePart type="given">Mehran</namePart>
<namePart type="family">Haddadi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">William</namePart>
<namePart type="given">John</namePart>
<namePart type="family">Teahan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 19th Conference of the European Chapter of the Association for Computational Linguistics (Volume 4: Student Research Workshop)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Selene</namePart>
<namePart type="family">Baez Santamaria</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sai</namePart>
<namePart type="given">Ashish</namePart>
<namePart type="family">Somayajula</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Atsuki</namePart>
<namePart type="family">Yamaguchi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Rabat, Morocco</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-383-8</identifier>
</relatedItem>
<abstract>This work evaluates the non-English and unstructured text compression performance of Large Language Models (LLMs) by comparing them with traditional baselines on datasets from the eight most widely spoken languages. Experimental results show that the evaluated LLM (LLaMA-3.2-1B) was considerably outperformed by the baselines, particularly on non-English datasets, where its performance relative to the best baseline was, on average, more than three times worse than on English datasets. It also compressed unstructured English data up to more than twofold less effectively than plain English data. Traditional methods, however, remained largely dataset-agnostic. Surprisingly, the LLM achieved worse compression ratios on some datasets than others despite modeling them more accurately. Overall, the outcomes and substantially higher compression time and resource consumption indicate that current LLMs are highly impractical for the compression task, where traditional methods continue to excel. Code is available at: https://github.com/mehranhaddadi13/llm_compress.</abstract>
<identifier type="citekey">haddadi-teahan-2026-comparing</identifier>
<location>
<url>https://aclanthology.org/2026.eacl-srw.16/</url>
</location>
<part>
<date>2026-03</date>
<extent unit="page">
<start>219</start>
<end>232</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Comparing Text Compression Capabilities of Large Language Models with Traditional Compression Algorithms
%A Haddadi, Mehran
%A Teahan, William John
%Y Baez Santamaria, Selene
%Y Somayajula, Sai Ashish
%Y Yamaguchi, Atsuki
%S Proceedings of the 19th Conference of the European Chapter of the Association for Computational Linguistics (Volume 4: Student Research Workshop)
%D 2026
%8 March
%I Association for Computational Linguistics
%C Rabat, Morocco
%@ 979-8-89176-383-8
%F haddadi-teahan-2026-comparing
%X This work evaluates the non-English and unstructured text compression performance of Large Language Models (LLMs) by comparing them with traditional baselines on datasets from the eight most widely spoken languages. Experimental results show that the evaluated LLM (LLaMA-3.2-1B) was considerably outperformed by the baselines, particularly on non-English datasets, where its performance relative to the best baseline was, on average, more than three times worse than on English datasets. It also compressed unstructured English data up to more than twofold less effectively than plain English data. Traditional methods, however, remained largely dataset-agnostic. Surprisingly, the LLM achieved worse compression ratios on some datasets than others despite modeling them more accurately. Overall, the outcomes and substantially higher compression time and resource consumption indicate that current LLMs are highly impractical for the compression task, where traditional methods continue to excel. Code is available at: https://github.com/mehranhaddadi13/llm_compress.
%U https://aclanthology.org/2026.eacl-srw.16/
%P 219-232
Markdown (Informal)
[Comparing Text Compression Capabilities of Large Language Models with Traditional Compression Algorithms](https://aclanthology.org/2026.eacl-srw.16/) (Haddadi & Teahan, EACL 2026)
ACL
Mehran Haddadi and William John Teahan. 2026. Comparing Text Compression Capabilities of Large Language Models with Traditional Compression Algorithms. In Proceedings of the 19th Conference of the European Chapter of the Association for Computational Linguistics (Volume 4: Student Research Workshop), pages 219–232, Rabat, Morocco. Association for Computational Linguistics.
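Note on the comparison the abstract describes: the traditional-baseline side boils down to a compression-ratio measurement (compressed size divided by original size). The sketch below is an illustration only, assuming Python's standard-library compressors (zlib, bz2, lzma) as stand-ins for traditional baselines; it is not the authors' pipeline, which is available at the repository linked in the abstract.

```python
# Illustrative only: compression ratio (compressed bytes / original bytes)
# for three standard-library compressors on a UTF-8 text sample.
import bz2
import lzma
import zlib


def compression_ratios(text: str) -> dict[str, float]:
    """Return compressed-size / original-size for each baseline compressor."""
    raw = text.encode("utf-8")
    compressors = {
        "zlib (DEFLATE)": lambda d: zlib.compress(d, level=9),
        "bz2 (BWT)": lambda d: bz2.compress(d, compresslevel=9),
        "lzma (LZMA2)": lambda d: lzma.compress(d),
    }
    return {name: len(fn(raw)) / len(raw) for name, fn in compressors.items()}


if __name__ == "__main__":
    sample = "the quick brown fox jumps over the lazy dog " * 200
    for name, ratio in compression_ratios(sample).items():
        print(f"{name}: ratio = {ratio:.3f}")
```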