@inproceedings{ranaldi-etal-2024-far,
title = "How Far Does the Sequence of Compositions Impact Multilingual Pre-Training?",
author = "Ranaldi, Leonardo and
Pucci, Giulia and
Zanzotto, Fabio Massimo",
editor = "Dell'Orletta, Felice and
Lenci, Alessandro and
Montemagni, Simonetta and
Sprugnoli, Rachele",
booktitle = "Proceedings of the 10th Italian Conference on Computational Linguistics (CLiC-it 2024)",
month = dec,
year = "2024",
address = "Pisa, Italy",
publisher = "CEUR Workshop Proceedings",
url = "https://aclanthology.org/2024.clicit-1.86/",
pages = "796--804",
ISBN = "979-12-210-7060-6",
abstract = "The most efficient strategy for conducting pre-training of language models is the concatenation of contiguous sequences of text of fixed length through causal masking that estimates the probability of each token given its context.However, the role of the composition sequence pre-training technique in the models' generalization properties has yet to be explored.In this paper, we show that operating via causal masking impacts model performance because it could include misleading information from previous text sequences during pre-training.To fill this gap, we propose intra-context causal masking where the probability of each token is conditional only on the previous in the same chunk of text, avoiding misleading information from different contexts.Hence, we demonstrate that organizing text chunks based on a policy that aligns with text similarity effectively reduces the risk of misleading context during pre-training by enhancing language models' in-context learning and factual knowledge storage capabilities while maintaining efficiency."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="ranaldi-etal-2024-far">
<titleInfo>
<title>How Far Does the Sequence of Compositions Impact Multilingual Pre-Training?</title>
</titleInfo>
<name type="personal">
<namePart type="given">Leonardo</namePart>
<namePart type="family">Ranaldi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Giulia</namePart>
<namePart type="family">Pucci</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Fabio</namePart>
<namePart type="given">Massimo</namePart>
<namePart type="family">Zanzotto</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 10th Italian Conference on Computational Linguistics (CLiC-it 2024)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Felice</namePart>
<namePart type="family">Dell’Orletta</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alessandro</namePart>
<namePart type="family">Lenci</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Simonetta</namePart>
<namePart type="family">Montemagni</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rachele</namePart>
<namePart type="family">Sprugnoli</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>CEUR Workshop Proceedings</publisher>
<place>
<placeTerm type="text">Pisa, Italy</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-12-210-7060-6</identifier>
</relatedItem>
<abstract>The most efficient strategy for conducting pre-training of language models is the concatenation of contiguous sequences of text of fixed length through causal masking that estimates the probability of each token given its context. However, the role of the composition sequence pre-training technique in the models’ generalization properties has yet to be explored. In this paper, we show that operating via causal masking impacts model performance because it could include misleading information from previous text sequences during pre-training. To fill this gap, we propose intra-context causal masking where the probability of each token is conditional only on the previous in the same chunk of text, avoiding misleading information from different contexts. Hence, we demonstrate that organizing text chunks based on a policy that aligns with text similarity effectively reduces the risk of misleading context during pre-training by enhancing language models’ in-context learning and factual knowledge storage capabilities while maintaining efficiency.</abstract>
<identifier type="citekey">ranaldi-etal-2024-far</identifier>
<location>
<url>https://aclanthology.org/2024.clicit-1.86/</url>
</location>
<part>
<date>2024-12</date>
<extent unit="page">
<start>796</start>
<end>804</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T How Far Does the Sequence of Compositions Impact Multilingual Pre-Training?
%A Ranaldi, Leonardo
%A Pucci, Giulia
%A Zanzotto, Fabio Massimo
%Y Dell’Orletta, Felice
%Y Lenci, Alessandro
%Y Montemagni, Simonetta
%Y Sprugnoli, Rachele
%S Proceedings of the 10th Italian Conference on Computational Linguistics (CLiC-it 2024)
%D 2024
%8 December
%I CEUR Workshop Proceedings
%C Pisa, Italy
%@ 979-12-210-7060-6
%F ranaldi-etal-2024-far
%X The most efficient strategy for conducting pre-training of language models is the concatenation of contiguous sequences of text of fixed length through causal masking that estimates the probability of each token given its context. However, the role of the composition sequence pre-training technique in the models’ generalization properties has yet to be explored. In this paper, we show that operating via causal masking impacts model performance because it could include misleading information from previous text sequences during pre-training. To fill this gap, we propose intra-context causal masking where the probability of each token is conditional only on the previous in the same chunk of text, avoiding misleading information from different contexts. Hence, we demonstrate that organizing text chunks based on a policy that aligns with text similarity effectively reduces the risk of misleading context during pre-training by enhancing language models’ in-context learning and factual knowledge storage capabilities while maintaining efficiency.
%U https://aclanthology.org/2024.clicit-1.86/
%P 796-804
Markdown (Informal)
[How Far Does the Sequence of Compositions Impact Multilingual Pre-Training?](https://aclanthology.org/2024.clicit-1.86/) (Ranaldi et al., CLiC-it 2024)
ACL
Leonardo Ranaldi, Giulia Pucci, and Fabio Massimo Zanzotto. 2024. How Far Does the Sequence of Compositions Impact Multilingual Pre-Training?. In Proceedings of the 10th Italian Conference on Computational Linguistics (CLiC-it 2024), pages 796–804, Pisa, Italy. CEUR Workshop Proceedings.
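
The abstract above describes intra-context causal masking, in which each token is conditioned only on earlier tokens from its own text chunk rather than on everything packed into the training sequence. As a minimal sketch of that idea (not the authors' implementation), the snippet below assumes chunk membership is given by a per-position chunk-id tensor and builds the corresponding block-diagonal causal attention mask in PyTorch; the function name, shapes, and example ids are illustrative assumptions.

```python
import torch


def intra_context_causal_mask(chunk_ids: torch.Tensor) -> torch.Tensor:
    """Boolean attention mask for one packed training sequence.

    chunk_ids: shape (seq_len,); positions from the same text chunk share an id,
    e.g. tensor([0, 0, 0, 1, 1, 2, 2, 2]) for three chunks packed together.
    Returns a (seq_len, seq_len) mask where mask[i, j] is True iff token i may
    attend to token j: j <= i AND both tokens belong to the same chunk.
    """
    positions = torch.arange(chunk_ids.shape[0])
    causal = positions.unsqueeze(1) >= positions.unsqueeze(0)      # j <= i
    same_chunk = chunk_ids.unsqueeze(1) == chunk_ids.unsqueeze(0)  # same chunk only
    return causal & same_chunk


if __name__ == "__main__":
    # Three documents packed into a single training sequence of length 8.
    chunk_ids = torch.tensor([0, 0, 0, 1, 1, 1, 2, 2])
    print(intra_context_causal_mask(chunk_ids).int())
    # The printed matrix is block-diagonal and lower-triangular within each block:
    # unlike plain causal masking over the packed sequence, no token attends
    # across a chunk boundary, so earlier, unrelated documents cannot leak in.
```

In an actual pre-training setup, a mask like this would replace the standard lower-triangular mask in the attention layers; a chunk-ordering policy such as the similarity-based grouping the paper proposes only changes how the chunk ids are laid out across the packed sequence, not the masking rule itself.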