@inproceedings{xie-etal-2025-simple,
title = "A Simple yet Efficient Prompt Compression Method for Text Classification Data Annotation Using {LLM}",
author = "Xie, Yiran and
Xiao, Debin and
Wang, Ping and
Liu, Shuming",
editor = "Rambow, Owen and
Wanner, Leo and
Apidianaki, Marianna and
Al-Khalifa, Hend and
Eugenio, Barbara Di and
Schockaert, Steven and
Darwish, Kareem and
Agarwal, Apoorv",
booktitle = "Proceedings of the 31st International Conference on Computational Linguistics: Industry Track",
month = jan,
year = "2025",
address = "Abu Dhabi, UAE",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.coling-industry.44/",
pages = "511--521",
abstract = "Effectively balancing accuracy and cost is a critical challenge when using large language models (LLMs) for corpus annotation. This paper introduces a novel compression method based on keyword extraction (PCKE) that effectively reduces the number of prompt tokens in text classification annotation tasks, with minimal to no loss in accuracy. Our approach begins with an LLM that generates both category labels and relevant keywords from a small unannotated dataset. These outputs are used to train a BERT-based multi-task model capable of simultaneous classification and keyword extraction. For larger unannotated corpora, this model extracts keywords which are then used in place of full texts for LLM annotation. The significant reduction in prompt tokens result in substantial cost savings. Furthermore, the use of a few well-chosen keywords ensures that classification accuracy is maintained. Extensive experiments validate that our method not only achieves a superior compression rate but also maintains high accuracy, outperforming existing general-purpose compression techniques. Our approach offers a practical and cost-efficient solution for large-scale text classification annotation using LLMs, particularly applicable in industrial settings."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="xie-etal-2025-simple">
<titleInfo>
<title>A Simple yet Efficient Prompt Compression Method for Text Classification Data Annotation Using LLM</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yiran</namePart>
<namePart type="family">Xie</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Debin</namePart>
<namePart type="family">Xiao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ping</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shuming</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-01</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 31st International Conference on Computational Linguistics: Industry Track</title>
</titleInfo>
<name type="personal">
<namePart type="given">Owen</namePart>
<namePart type="family">Rambow</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Leo</namePart>
<namePart type="family">Wanner</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marianna</namePart>
<namePart type="family">Apidianaki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hend</namePart>
<namePart type="family">Al-Khalifa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Barbara</namePart>
<namePart type="given">Di</namePart>
<namePart type="family">Eugenio</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Steven</namePart>
<namePart type="family">Schockaert</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kareem</namePart>
<namePart type="family">Darwish</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Apoorv</namePart>
<namePart type="family">Agarwal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, UAE</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Effectively balancing accuracy and cost is a critical challenge when using large language models (LLMs) for corpus annotation. This paper introduces a novel compression method based on keyword extraction (PCKE) that effectively reduces the number of prompt tokens in text classification annotation tasks, with minimal to no loss in accuracy. Our approach begins with an LLM that generates both category labels and relevant keywords from a small unannotated dataset. These outputs are used to train a BERT-based multi-task model capable of simultaneous classification and keyword extraction. For larger unannotated corpora, this model extracts keywords, which are then used in place of full texts for LLM annotation. The significant reduction in prompt tokens results in substantial cost savings. Furthermore, the use of a few well-chosen keywords ensures that classification accuracy is maintained. Extensive experiments validate that our method not only achieves a superior compression rate but also maintains high accuracy, outperforming existing general-purpose compression techniques. Our approach offers a practical and cost-efficient solution for large-scale text classification annotation using LLMs, particularly applicable in industrial settings.</abstract>
<identifier type="citekey">xie-etal-2025-simple</identifier>
<location>
<url>https://aclanthology.org/2025.coling-industry.44/</url>
</location>
<part>
<date>2025-01</date>
<extent unit="page">
<start>511</start>
<end>521</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T A Simple yet Efficient Prompt Compression Method for Text Classification Data Annotation Using LLM
%A Xie, Yiran
%A Xiao, Debin
%A Wang, Ping
%A Liu, Shuming
%Y Rambow, Owen
%Y Wanner, Leo
%Y Apidianaki, Marianna
%Y Al-Khalifa, Hend
%Y Eugenio, Barbara Di
%Y Schockaert, Steven
%Y Darwish, Kareem
%Y Agarwal, Apoorv
%S Proceedings of the 31st International Conference on Computational Linguistics: Industry Track
%D 2025
%8 January
%I Association for Computational Linguistics
%C Abu Dhabi, UAE
%F xie-etal-2025-simple
%X Effectively balancing accuracy and cost is a critical challenge when using large language models (LLMs) for corpus annotation. This paper introduces a novel compression method based on keyword extraction (PCKE) that effectively reduces the number of prompt tokens in text classification annotation tasks, with minimal to no loss in accuracy. Our approach begins with an LLM that generates both category labels and relevant keywords from a small unannotated dataset. These outputs are used to train a BERT-based multi-task model capable of simultaneous classification and keyword extraction. For larger unannotated corpora, this model extracts keywords, which are then used in place of full texts for LLM annotation. The significant reduction in prompt tokens results in substantial cost savings. Furthermore, the use of a few well-chosen keywords ensures that classification accuracy is maintained. Extensive experiments validate that our method not only achieves a superior compression rate but also maintains high accuracy, outperforming existing general-purpose compression techniques. Our approach offers a practical and cost-efficient solution for large-scale text classification annotation using LLMs, particularly applicable in industrial settings.
%U https://aclanthology.org/2025.coling-industry.44/
%P 511-521
Markdown (Informal)
[A Simple yet Efficient Prompt Compression Method for Text Classification Data Annotation Using LLM](https://aclanthology.org/2025.coling-industry.44/) (Xie et al., COLING 2025)
ACL
Yiran Xie, Debin Xiao, Ping Wang, and Shuming Liu. 2025. [A Simple yet Efficient Prompt Compression Method for Text Classification Data Annotation Using LLM](https://aclanthology.org/2025.coling-industry.44/). In Proceedings of the 31st International Conference on Computational Linguistics: Industry Track, pages 511–521, Abu Dhabi, UAE. Association for Computational Linguistics.