@inproceedings{luo-etal-2024-taking,
title = "Taking a Deep Breath: Enhancing Language Modeling of Large Language Models with Sentinel Tokens",
author = "Luo, Weiyao and
Zheng, Suncong and
Xia, Heming and
Wang, Weikang and
Lei, Yan and
Liu, Tianyu and
Chen, Shuang and
Sui, Zhifang",
editor = "Al-Onaizan, Yaser and
Bansal, Mohit and
Chen, Yun-Nung",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2024",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.findings-emnlp.233",
pages = "4034--4040",
abstract = "Large language models (LLMs) have shown promising efficacy across various tasks, becoming powerful tools in numerous aspects of human life. However, Transformer-based LLMs suffer a performance degradation when modeling long-term contexts due to they discard some information to reduce computational overhead. In this work, we propose a simple yet effective method to enable LLMs to take a deep breath, encouraging them to summarize information contained within discrete text chunks. Specifically, we segment the text into multiple chunks and insert special token {\textless}SR{\textgreater} at the end of each chunk. We then modify the attention mask to integrate the chunk{'}s information into the corresponding {\textless}SR{\textgreater} token. This facilitates LLMs to interpret information not only from historical individual tokens but also from the {\textless}SR{\textgreater} token, aggregating the chunk{'}s semantic information. Experiments on language modeling and out-of-domain downstream tasks validate the superiority of our approach.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="luo-etal-2024-taking">
<titleInfo>
<title>Taking a Deep Breath: Enhancing Language Modeling of Large Language Models with Sentinel Tokens</title>
</titleInfo>
<name type="personal">
<namePart type="given">Weiyao</namePart>
<namePart type="family">Luo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Suncong</namePart>
<namePart type="family">Zheng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Heming</namePart>
<namePart type="family">Xia</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Weikang</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yan</namePart>
<namePart type="family">Lei</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tianyu</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shuang</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhifang</namePart>
<namePart type="family">Sui</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2024</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yaser</namePart>
<namePart type="family">Al-Onaizan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohit</namePart>
<namePart type="family">Bansal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yun-Nung</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, Florida, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Large language models (LLMs) have shown promising efficacy across various tasks, becoming powerful tools in numerous aspects of human life. However, Transformer-based LLMs suffer performance degradation when modeling long-term contexts because they discard some information to reduce computational overhead. In this work, we propose a simple yet effective method to enable LLMs to take a deep breath, encouraging them to summarize information contained within discrete text chunks. Specifically, we segment the text into multiple chunks and insert a special token &lt;SR&gt; at the end of each chunk. We then modify the attention mask to integrate the chunk’s information into the corresponding &lt;SR&gt; token. This enables LLMs to interpret information not only from individual historical tokens but also from the &lt;SR&gt; token, which aggregates the chunk’s semantic information. Experiments on language modeling and out-of-domain downstream tasks validate the superiority of our approach.</abstract>
<identifier type="citekey">luo-etal-2024-taking</identifier>
<location>
<url>https://aclanthology.org/2024.findings-emnlp.233</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>4034</start>
<end>4040</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Taking a Deep Breath: Enhancing Language Modeling of Large Language Models with Sentinel Tokens
%A Luo, Weiyao
%A Zheng, Suncong
%A Xia, Heming
%A Wang, Weikang
%A Lei, Yan
%A Liu, Tianyu
%A Chen, Shuang
%A Sui, Zhifang
%Y Al-Onaizan, Yaser
%Y Bansal, Mohit
%Y Chen, Yun-Nung
%S Findings of the Association for Computational Linguistics: EMNLP 2024
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F luo-etal-2024-taking
%X Large language models (LLMs) have shown promising efficacy across various tasks, becoming powerful tools in numerous aspects of human life. However, Transformer-based LLMs suffer performance degradation when modeling long-term contexts because they discard some information to reduce computational overhead. In this work, we propose a simple yet effective method to enable LLMs to take a deep breath, encouraging them to summarize information contained within discrete text chunks. Specifically, we segment the text into multiple chunks and insert a special token <SR> at the end of each chunk. We then modify the attention mask to integrate the chunk’s information into the corresponding <SR> token. This enables LLMs to interpret information not only from individual historical tokens but also from the <SR> token, which aggregates the chunk’s semantic information. Experiments on language modeling and out-of-domain downstream tasks validate the superiority of our approach.
%U https://aclanthology.org/2024.findings-emnlp.233
%P 4034-4040
Markdown (Informal):
[Taking a Deep Breath: Enhancing Language Modeling of Large Language Models with Sentinel Tokens](https://aclanthology.org/2024.findings-emnlp.233) (Luo et al., Findings 2024)

ACL:
Weiyao Luo, Suncong Zheng, Heming Xia, Weikang Wang, Yan Lei, Tianyu Liu, Shuang Chen, and Zhifang Sui. 2024. Taking a Deep Breath: Enhancing Language Modeling of Large Language Models with Sentinel Tokens. In Findings of the Association for Computational Linguistics: EMNLP 2024, pages 4034–4040, Miami, Florida, USA. Association for Computational Linguistics.
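The abstract only outlines the attention-mask modification at a high level. As one plausible reading of that description (not the authors' released implementation), the sketch below builds a boolean attention mask over a sequence laid out as fixed-size chunks, each followed by one <SR> sentinel token; ordinary tokens keep standard causal access to history (including earlier <SR> tokens), while each <SR> token is restricted to its own chunk so that its representation aggregates that chunk. The chunk length, the exact visibility rule for <SR>, and the helper name `build_sentinel_mask` are illustrative assumptions.

```python
# Minimal sketch of a chunked causal mask with <SR> sentinel tokens,
# based on the method description in the abstract (assumptions noted above).
import numpy as np

def build_sentinel_mask(num_chunks: int, chunk_len: int) -> np.ndarray:
    """Return a boolean attention mask (True = query may attend to key).

    Assumed sequence layout: [tok ... tok <SR>] repeated num_chunks times,
    i.e. each chunk holds `chunk_len` ordinary tokens followed by one <SR>.
    """
    seq_len = num_chunks * (chunk_len + 1)
    # Positions of the sentinel tokens: the last slot of every chunk.
    sr_positions = {(c + 1) * (chunk_len + 1) - 1 for c in range(num_chunks)}

    # Start from a standard causal (lower-triangular) mask.
    mask = np.tril(np.ones((seq_len, seq_len), dtype=bool))

    for q in sr_positions:
        # Assumed rule: an <SR> token attends only within its own chunk,
        # so its hidden state summarizes that chunk's information.
        chunk_start = q - chunk_len
        mask[q, :chunk_start] = False
    return mask

if __name__ == "__main__":
    # Three chunks of four tokens each, plus one <SR> per chunk (15 positions).
    print(build_sentinel_mask(num_chunks=3, chunk_len=4).astype(int))
```

In an actual model, such a mask would replace the plain causal mask passed to the attention layers during fine-tuning, with <SR> added to the tokenizer as a special token; those integration details are outside what the abstract specifies.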