@inproceedings{schafer-etal-2024-effect,
title = "On the Effect of (Near) Duplicate Subwords in Language Modelling",
author = {Sch{\"a}fer, Anton and
Hofmann, Thomas and
Schlag, Imanol and
Pimentel, Tiago},
editor = "Ku, Lun-Wei and
Martins, Andre and
Srikumar, Vivek",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2024",
month = aug,
year = "2024",
address = "Bangkok, Thailand",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.findings-acl.571",
doi = "10.18653/v1/2024.findings-acl.571",
pages = "9580--9597",
abstract = "Tokenisation is a core part of language models (LMs). It involves splitting a character sequence into subwords which are assigned random indices before being served to the LM. However, this process{---}while typically lossless{---}may lead to less efficient LM training, because it removes character-level information, thereby making it more difficult to generalise across similar subwords, such as *now* and *Now*. We refer to such subwords as **near duplicates**. In this paper, we study the impact of near duplicate subwords on LM training efficiency. First, we design an experiment that gives us an upper bound to how much we should expect a model to improve if we could perfectly generalise across near duplicates. We do this, by duplicating each token in our LM{'}s vocabulary, creating perfectly equivalent classes of subwords. Experimentally, we find that LMs need roughly 17{\%} more data when trained in a fully duplicated setting. Second, we investigate the impact of naturally occurring near duplicates on LMs. Here, we see that deduplicating them considerably hurts LM performance; but that this loss in performance can be easily mitigated.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="schafer-etal-2024-effect">
<titleInfo>
<title>On the Effect of (Near) Duplicate Subwords in Language Modelling</title>
</titleInfo>
<name type="personal">
<namePart type="given">Anton</namePart>
<namePart type="family">Schäfer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Thomas</namePart>
<namePart type="family">Hofmann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Imanol</namePart>
<namePart type="family">Schlag</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tiago</namePart>
<namePart type="family">Pimentel</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2024</title>
</titleInfo>
<name type="personal">
<namePart type="given">Lun-Wei</namePart>
<namePart type="family">Ku</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Andre</namePart>
<namePart type="family">Martins</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vivek</namePart>
<namePart type="family">Srikumar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Bangkok, Thailand</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Tokenisation is a core part of language models (LMs). It involves splitting a character sequence into subwords which are assigned random indices before being served to the LM. However, this process—while typically lossless—may lead to less efficient LM training, because it removes character-level information, thereby making it more difficult to generalise across similar subwords, such as *now* and *Now*. We refer to such subwords as **near duplicates**. In this paper, we study the impact of near duplicate subwords on LM training efficiency. First, we design an experiment that gives us an upper bound to how much we should expect a model to improve if we could perfectly generalise across near duplicates. We do this, by duplicating each token in our LM’s vocabulary, creating perfectly equivalent classes of subwords. Experimentally, we find that LMs need roughly 17% more data when trained in a fully duplicated setting. Second, we investigate the impact of naturally occurring near duplicates on LMs. Here, we see that deduplicating them considerably hurts LM performance; but that this loss in performance can be easily mitigated.</abstract>
<identifier type="citekey">schafer-etal-2024-effect</identifier>
<identifier type="doi">10.18653/v1/2024.findings-acl.571</identifier>
<location>
<url>https://aclanthology.org/2024.findings-acl.571</url>
</location>
<part>
<date>2024-08</date>
<extent unit="page">
<start>9580</start>
<end>9597</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T On the Effect of (Near) Duplicate Subwords in Language Modelling
%A Schäfer, Anton
%A Hofmann, Thomas
%A Schlag, Imanol
%A Pimentel, Tiago
%Y Ku, Lun-Wei
%Y Martins, Andre
%Y Srikumar, Vivek
%S Findings of the Association for Computational Linguistics: ACL 2024
%D 2024
%8 August
%I Association for Computational Linguistics
%C Bangkok, Thailand
%F schafer-etal-2024-effect
%X Tokenisation is a core part of language models (LMs). It involves splitting a character sequence into subwords which are assigned random indices before being served to the LM. However, this process—while typically lossless—may lead to less efficient LM training, because it removes character-level information, thereby making it more difficult to generalise across similar subwords, such as *now* and *Now*. We refer to such subwords as **near duplicates**. In this paper, we study the impact of near duplicate subwords on LM training efficiency. First, we design an experiment that gives us an upper bound to how much we should expect a model to improve if we could perfectly generalise across near duplicates. We do this, by duplicating each token in our LM’s vocabulary, creating perfectly equivalent classes of subwords. Experimentally, we find that LMs need roughly 17% more data when trained in a fully duplicated setting. Second, we investigate the impact of naturally occurring near duplicates on LMs. Here, we see that deduplicating them considerably hurts LM performance; but that this loss in performance can be easily mitigated.
%R 10.18653/v1/2024.findings-acl.571
%U https://aclanthology.org/2024.findings-acl.571
%U https://doi.org/10.18653/v1/2024.findings-acl.571
%P 9580-9597
Markdown (Informal)
[On the Effect of (Near) Duplicate Subwords in Language Modelling](https://aclanthology.org/2024.findings-acl.571) (Schäfer et al., Findings 2024)
ACL