% DisComp (Liu et al.): conference paper in Findings of the ACL: NAACL 2025.
% The citation key is kept unchanged so existing citations keep resolving.
@inproceedings{quancai-etal-2025-discomp,
  title     = {{DisComp}: A Two-Stage Prompt Optimization Framework Combining Task-Agnostic and Task-Aware Compression},
  author    = {Liu, Quancai and
               Fan, Haihui and
               Zhang, Jinchao and
               Li, Xiangfang and
               Li, Chuanrong and
               Li, Bo},
  editor    = {Chiruzzo, Luis and
               Ritter, Alan and
               Wang, Lu},
  booktitle = {Findings of the Association for Computational Linguistics: NAACL 2025},
  month     = apr,
  year      = {2025},
  address   = {Albuquerque, New Mexico},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.findings-naacl.58/},
  doi       = {10.18653/v1/2025.findings-naacl.58},
  pages     = {1033--1044},
  isbn      = {979-8-89176-195-7},
  abstract  = {Large language models (LLMs) exhibit exceptional performance across a wide range of natural language processing tasks, often relying on lengthy prompts to harness their full capabilities. However, extended prompts can lead to substantial computational overhead and increased hardware demands, limiting the scalability and efficiency of such models. In this paper, we propose DisComp, a two-stage prompt compression framework based on knowledge distillation that combines task-agnostic and task-aware strategies, designed to efficiently compress prompt length without compromising performance. In the first stage, task-agnostic compression is achieved through knowledge distillation, transferring the summarization capabilities of an LLM to a smaller, more efficient model. The distillation process combines cross-entropy loss and keyword matching loss to ensure the smaller model generates concise and informative summaries. In the second stage, sentence-level pruning is applied, where sentences are ranked by relevance to the query, and irrelevant sentences are pruned to retain only task-critical information. We evaluate our method on three benchmark datasets, LongBench, ZeroSCROLLS and NaturalQuestions. The results show that DisComp significantly outperforms previous task-agnostic and task-specific compression approaches, and it is up to 6.56{\texttimes} faster at inference compared to the best token-level compression method.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="quancai-etal-2025-discomp">
<titleInfo>
<title>DisComp: A Two-Stage Prompt Optimization Framework Combining Task-Agnostic and Task-Aware Compression</title>
</titleInfo>
<name type="personal">
<namePart type="given">Quancai</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Haihui</namePart>
<namePart type="family">Fan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jinchao</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xiangfang</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chuanrong</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bo</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-04</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: NAACL 2025</title>
</titleInfo>
<name type="personal">
<namePart type="given">Luis</namePart>
<namePart type="family">Chiruzzo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alan</namePart>
<namePart type="family">Ritter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lu</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Albuquerque, New Mexico</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-195-7</identifier>
</relatedItem>
<abstract>Large language models (LLMs) exhibit exceptional performance across a wide range of natural language processing tasks, often relying on lengthy prompts to harness their full capabilities. However, extended prompts can lead to substantial computational overhead and increased hardware demands, limiting the scalability and efficiency of such models. In this paper, we propose DisComp, a two-stage prompt compression framework based on knowledge distillation that combines task-agnostic and task-aware strategies, designed to efficiently compress prompt length without compromising performance. In the first stage, task-agnostic compression is achieved through knowledge distillation, transferring the summarization capabilities of an LLM to a smaller, more efficient model. The distillation process combines cross-entropy loss and keyword matching loss to ensure the smaller model generates concise and informative summaries. In the second stage, sentence-level pruning is applied, where sentences are ranked by relevance to the query, and irrelevant sentences are pruned to retain only task-critical information. We evaluate our method on three benchmark datasets, LongBench, ZeroSCROLLS and NaturalQuestions. The results show that DisComp significantly outperforms previous task-agnostic and task-specific compression approaches, and it is up to 6.56× faster at inference compared to the best token-level compression method.</abstract>
<identifier type="citekey">quancai-etal-2025-discomp</identifier>
<identifier type="doi">10.18653/v1/2025.findings-naacl.58</identifier>
<location>
<url>https://aclanthology.org/2025.findings-naacl.58/</url>
</location>
<part>
<date>2025-04</date>
<extent unit="page">
<start>1033</start>
<end>1044</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T DisComp: A Two-Stage Prompt Optimization Framework Combining Task-Agnostic and Task-Aware Compression
%A Liu, Quancai
%A Fan, Haihui
%A Zhang, Jinchao
%A Li, Xiangfang
%A Li, Chuanrong
%A Li, Bo
%Y Chiruzzo, Luis
%Y Ritter, Alan
%Y Wang, Lu
%S Findings of the Association for Computational Linguistics: NAACL 2025
%D 2025
%8 April
%I Association for Computational Linguistics
%C Albuquerque, New Mexico
%@ 979-8-89176-195-7
%F quancai-etal-2025-discomp
%X Large language models (LLMs) exhibit exceptional performance across a wide range of natural language processing tasks, often relying on lengthy prompts to harness their full capabilities. However, extended prompts can lead to substantial computational overhead and increased hardware demands, limiting the scalability and efficiency of such models. In this paper, we propose DisComp, a two-stage prompt compression framework based on knowledge distillation that combines task-agnostic and task-aware strategies, designed to efficiently compress prompt length without compromising performance. In the first stage, task-agnostic compression is achieved through knowledge distillation, transferring the summarization capabilities of an LLM to a smaller, more efficient model. The distillation process combines cross-entropy loss and keyword matching loss to ensure the smaller model generates concise and informative summaries. In the second stage, sentence-level pruning is applied, where sentences are ranked by relevance to the query, and irrelevant sentences are pruned to retain only task-critical information. We evaluate our method on three benchmark datasets, LongBench, ZeroSCROLLS and NaturalQuestions. The results show that DisComp significantly outperforms previous task-agnostic and task-specific compression approaches, and it is up to 6.56× faster at inference compared to the best token-level compression method.
%R 10.18653/v1/2025.findings-naacl.58
%U https://aclanthology.org/2025.findings-naacl.58/
%U https://doi.org/10.18653/v1/2025.findings-naacl.58
%P 1033-1044
Markdown (Informal)
[DisComp: A Two-Stage Prompt Optimization Framework Combining Task-Agnostic and Task-Aware Compression](https://aclanthology.org/2025.findings-naacl.58/) (Liu et al., Findings 2025)
ACL