@inproceedings{yen-etal-2024-long,
title = "Long-Context Language Modeling with Parallel Context Encoding",
author = "Yen, Howard and
Gao, Tianyu and
Chen, Danqi",
editor = "Ku, Lun-Wei and
Martins, Andre and
Srikumar, Vivek",
booktitle = "Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = aug,
year = "2024",
address = "Bangkok, Thailand",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.luhme-long.142/",
doi = "10.18653/v1/2024.acl-long.142",
pages = "2588--2610",
abstract = "Extending large language models (LLMs) to process longer inputs is crucial for numerous applications. However, the considerable computational cost of transformers, coupled with limited generalization of positional encoding, restricts the size of their context window. We introduce Cross-Attention to Parallel Encodings (CAPE), a framework that can be applied to any existing decoder-only LLMs for context expansion. CAPE leverages a small encoder to process a long input chunk by chunk and enables the frozen decoder to cross-attend to the additional contexts. CAPE is efficient, generalizable, and versatile: trained with 8K-token documents, CAPE extends the context window of LLaMA-2 to 128K tokens, offering $10\times$ of the throughput with only 1/6 of the memory. CAPE yields strong performance on language modeling and in-context learning. CAPE also excels in retrieval-augmented applications, while existing long-context models degenerate with retrieved contexts. We further introduce a CAPE variant that can extend the context window of instruction-tuned models with only unlabeled data, and showcase its effectiveness on LLaMA-2-Chat, leading to a strong instruction-following model that can leverage very long context on downstream tasks."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="yen-etal-2024-long">
<titleInfo>
<title>Long-Context Language Modeling with Parallel Context Encoding</title>
</titleInfo>
<name type="personal">
<namePart type="given">Howard</namePart>
<namePart type="family">Yen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tianyu</namePart>
<namePart type="family">Gao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Danqi</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Lun-Wei</namePart>
<namePart type="family">Ku</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Andre</namePart>
<namePart type="family">Martins</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vivek</namePart>
<namePart type="family">Srikumar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Bangkok, Thailand</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Extending large language models (LLMs) to process longer inputs is crucial for numerous applications. However, the considerable computational cost of transformers, coupled with limited generalization of positional encoding, restricts the size of their context window. We introduce Cross-Attention to Parallel Encodings (CAPE), a framework that can be applied to any existing decoder-only LLMs for context expansion. CAPE leverages a small encoder to process a long input chunk by chunk and enables the frozen decoder to cross-attend to the additional contexts. CAPE is efficient, generalizable, and versatile: trained with 8K-token documents, CAPE extends the context window of LLaMA-2 to 128K tokens, offering 10\times of the throughput with only 1/6 of the memory. CAPE yields strong performance on language modeling and in-context learning. CAPE also excels in retrieval-augmented applications, while existing long-context models degenerate with retrieved contexts. We further introduce a CAPE variant that can extend the context window of instruction-tuned models with only unlabeled data, and showcase its effectiveness on LLaMA-2-Chat, leading to a strong instruction-following model that can leverage very long context on downstream tasks.</abstract>
<identifier type="citekey">yen-etal-2024-long</identifier>
<identifier type="doi">10.18653/v1/2024.acl-long.142</identifier>
<location>
<url>https://aclanthology.org/2024.luhme-long.142/</url>
</location>
<part>
<date>2024-08</date>
<extent unit="page">
<start>2588</start>
<end>2610</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Long-Context Language Modeling with Parallel Context Encoding
%A Yen, Howard
%A Gao, Tianyu
%A Chen, Danqi
%Y Ku, Lun-Wei
%Y Martins, Andre
%Y Srikumar, Vivek
%S Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2024
%8 August
%I Association for Computational Linguistics
%C Bangkok, Thailand
%F yen-etal-2024-long
%X Extending large language models (LLMs) to process longer inputs is crucial for numerous applications. However, the considerable computational cost of transformers, coupled with limited generalization of positional encoding, restricts the size of their context window. We introduce Cross-Attention to Parallel Encodings (CAPE), a framework that can be applied to any existing decoder-only LLMs for context expansion. CAPE leverages a small encoder to process a long input chunk by chunk and enables the frozen decoder to cross-attend to the additional contexts. CAPE is efficient, generalizable, and versatile: trained with 8K-token documents, CAPE extends the context window of LLaMA-2 to 128K tokens, offering 10\times of the throughput with only 1/6 of the memory. CAPE yields strong performance on language modeling and in-context learning. CAPE also excels in retrieval-augmented applications, while existing long-context models degenerate with retrieved contexts. We further introduce a CAPE variant that can extend the context window of instruction-tuned models with only unlabeled data, and showcase its effectiveness on LLaMA-2-Chat, leading to a strong instruction-following model that can leverage very long context on downstream tasks.
%R 10.18653/v1/2024.acl-long.142
%U https://aclanthology.org/2024.luhme-long.142/
%U https://doi.org/10.18653/v1/2024.acl-long.142
%P 2588-2610
Markdown (Informal)
[Long-Context Language Modeling with Parallel Context Encoding](https://aclanthology.org/2024.luhme-long.142/) (Yen et al., ACL 2024)
ACL