@inproceedings{tan-etal-2024-lloco,
title = "{LL}o{CO}: Learning Long Contexts Offline",
author = "Tan, Sijun and
Li, Xiuyu and
Patil, Shishir G and
Wu, Ziyang and
Zhang, Tianjun and
Keutzer, Kurt and
Gonzalez, Joseph and
Popa, Raluca",
editor = "Al-Onaizan, Yaser and
Bansal, Mohit and
Chen, Yun-Nung",
booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.emnlp-main.975",
pages = "17605--17621",
abstract = "Processing long contexts remains a challenge for large language models (LLMs) due to the quadratic computational and memory overhead of the self-attention mechanism and the substantial KV cache sizes during generation. We propose LLoCO, a novel approach to address this problem by learning contexts offline through context compression and in-domain parameter-efficient finetuning with LoRA. Our method enables an LLM to create a concise representation of the original context and efficiently retrieve relevant information to answer questions accurately. Our approach extends the effective context window of a 4k-token LLaMA2-7B model to handle up to 128k tokens. We evaluate our approach on several long-context question-answering datasets, demonstrating that LLoCO significantly outperforms in-context learning while using $30 \times$ fewer tokens during inference. LLoCO achieves up to $7.62 \times$ speed-up during inference and $11.52 \times$ higher throughput during finetuning, substantially reducing the cost of long-document question answering. This makes it a promising solution for efficient long-context processing.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="tan-etal-2024-lloco">
<titleInfo>
<title>LLoCO: Learning Long Contexts Offline</title>
</titleInfo>
<name type="personal">
<namePart type="given">Sijun</namePart>
<namePart type="family">Tan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xiuyu</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shishir</namePart>
<namePart type="given">G</namePart>
<namePart type="family">Patil</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ziyang</namePart>
<namePart type="family">Wu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tianjun</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kurt</namePart>
<namePart type="family">Keutzer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joseph</namePart>
<namePart type="family">Gonzalez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Raluca</namePart>
<namePart type="family">Popa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yaser</namePart>
<namePart type="family">Al-Onaizan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohit</namePart>
<namePart type="family">Bansal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yun-Nung</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, Florida, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Processing long contexts remains a challenge for large language models (LLMs) due to the quadratic computational and memory overhead of the self-attention mechanism and the substantial KV cache sizes during generation. We propose LLoCO, a novel approach to address this problem by learning contexts offline through context compression and in-domain parameter-efficient finetuning with LoRA. Our method enables an LLM to create a concise representation of the original context and efficiently retrieve relevant information to answer questions accurately. Our approach extends the effective context window of a 4k-token LLaMA2-7B model to handle up to 128k tokens. We evaluate our approach on several long-context question-answering datasets, demonstrating that LLoCO significantly outperforms in-context learning while using 30× fewer tokens during inference. LLoCO achieves up to 7.62× speed-up during inference and 11.52× higher throughput during finetuning, substantially reducing the cost of long-document question answering. This makes it a promising solution for efficient long-context processing.</abstract>
<identifier type="citekey">tan-etal-2024-lloco</identifier>
<location>
<url>https://aclanthology.org/2024.emnlp-main.975</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>17605</start>
<end>17621</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T LLoCO: Learning Long Contexts Offline
%A Tan, Sijun
%A Li, Xiuyu
%A Patil, Shishir G.
%A Wu, Ziyang
%A Zhang, Tianjun
%A Keutzer, Kurt
%A Gonzalez, Joseph
%A Popa, Raluca
%Y Al-Onaizan, Yaser
%Y Bansal, Mohit
%Y Chen, Yun-Nung
%S Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F tan-etal-2024-lloco
%X Processing long contexts remains a challenge for large language models (LLMs) due to the quadratic computational and memory overhead of the self-attention mechanism and the substantial KV cache sizes during generation. We propose LLoCO, a novel approach to address this problem by learning contexts offline through context compression and in-domain parameter-efficient finetuning with LoRA. Our method enables an LLM to create a concise representation of the original context and efficiently retrieve relevant information to answer questions accurately. Our approach extends the effective context window of a 4k-token LLaMA2-7B model to handle up to 128k tokens. We evaluate our approach on several long-context question-answering datasets, demonstrating that LLoCO significantly outperforms in-context learning while using 30× fewer tokens during inference. LLoCO achieves up to 7.62× speed-up during inference and 11.52× higher throughput during finetuning, substantially reducing the cost of long-document question answering. This makes it a promising solution for efficient long-context processing.
%U https://aclanthology.org/2024.emnlp-main.975
%P 17605-17621
Markdown (Informal)
[LLoCO: Learning Long Contexts Offline](https://aclanthology.org/2024.emnlp-main.975) (Tan et al., EMNLP 2024)
ACL
Sijun Tan, Xiuyu Li, Shishir G Patil, Ziyang Wu, Tianjun Zhang, Kurt Keutzer, Joseph Gonzalez, and Raluca Popa. 2024. LLoCO: Learning Long Contexts Offline. In Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing, pages 17605–17621, Miami, Florida, USA. Association for Computational Linguistics.