@inproceedings{vilar-federico-2021-statistical,
title = "A Statistical Extension of Byte-Pair Encoding",
author = "Vilar, David and
Federico, Marcello",
editor = "Federico, Marcello and
Waibel, Alex and
Costa-juss{\`a}, Marta R. and
Niehues, Jan and
Stuker, Sebastian and
Salesky, Elizabeth",
booktitle = "Proceedings of the 18th International Conference on Spoken Language Translation (IWSLT 2021)",
month = aug,
year = "2021",
address = "Bangkok, Thailand (online)",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.iwslt-1.31",
doi = "10.18653/v1/2021.iwslt-1.31",
pages = "263--275",
abstract = "Sub-word segmentation is currently a standard tool for training neural machine translation (MT) systems and other NLP tasks. The goal is to split words (both in the source and target languages) into smaller units which then constitute the input and output vocabularies of the MT system. The aim of reducing the size of the input and output vocabularies is to increase the generalization capabilities of the translation model, enabling the system to translate and generate infrequent and new (unseen) words at inference time by combining previously seen sub-word units. Ideally, we would expect the created units to have some linguistic meaning, so that words are created in a compositional way. However, the most popular word-splitting method, Byte-Pair Encoding (BPE), which originates from the data compression literature, does not include explicit criteria to favor linguistic splittings nor to find the optimal sub-word granularity for the given training data. In this paper, we propose a statistically motivated extension of the BPE algorithm and an effective convergence criterion that avoids the costly experimentation cycle needed to select the best sub-word vocabulary size. Experimental results with morphologically rich languages show that our model achieves nearly-optimal BLEU scores and produces morphologically better word segmentations, which allows to outperform BPE{'}s generalization in the translation of sentences containing new words, as shown via human evaluation.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="vilar-federico-2021-statistical">
<titleInfo>
<title>A Statistical Extension of Byte-Pair Encoding</title>
</titleInfo>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Vilar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marcello</namePart>
<namePart type="family">Federico</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 18th International Conference on Spoken Language Translation (IWSLT 2021)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Marcello</namePart>
<namePart type="family">Federico</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alex</namePart>
<namePart type="family">Waibel</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marta</namePart>
<namePart type="given">R</namePart>
<namePart type="family">Costa-jussà</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jan</namePart>
<namePart type="family">Niehues</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sebastian</namePart>
<namePart type="family">Stuker</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Elizabeth</namePart>
<namePart type="family">Salesky</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Bangkok, Thailand (online)</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Sub-word segmentation is currently a standard tool for training neural machine translation (MT) systems and other NLP tasks. The goal is to split words (both in the source and target languages) into smaller units which then constitute the input and output vocabularies of the MT system. The aim of reducing the size of the input and output vocabularies is to increase the generalization capabilities of the translation model, enabling the system to translate and generate infrequent and new (unseen) words at inference time by combining previously seen sub-word units. Ideally, we would expect the created units to have some linguistic meaning, so that words are created in a compositional way. However, the most popular word-splitting method, Byte-Pair Encoding (BPE), which originates from the data compression literature, does not include explicit criteria to favor linguistic splittings nor to find the optimal sub-word granularity for the given training data. In this paper, we propose a statistically motivated extension of the BPE algorithm and an effective convergence criterion that avoids the costly experimentation cycle needed to select the best sub-word vocabulary size. Experimental results with morphologically rich languages show that our model achieves nearly-optimal BLEU scores and produces morphologically better word segmentations, which allows to outperform BPE’s generalization in the translation of sentences containing new words, as shown via human evaluation.</abstract>
<identifier type="citekey">vilar-federico-2021-statistical</identifier>
<identifier type="doi">10.18653/v1/2021.iwslt-1.31</identifier>
<location>
<url>https://aclanthology.org/2021.iwslt-1.31</url>
</location>
<part>
<date>2021-08</date>
<extent unit="page">
<start>263</start>
<end>275</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T A Statistical Extension of Byte-Pair Encoding
%A Vilar, David
%A Federico, Marcello
%Y Federico, Marcello
%Y Waibel, Alex
%Y Costa-jussà, Marta R.
%Y Niehues, Jan
%Y Stuker, Sebastian
%Y Salesky, Elizabeth
%S Proceedings of the 18th International Conference on Spoken Language Translation (IWSLT 2021)
%D 2021
%8 August
%I Association for Computational Linguistics
%C Bangkok, Thailand (online)
%F vilar-federico-2021-statistical
%X Sub-word segmentation is currently a standard tool for training neural machine translation (MT) systems and other NLP tasks. The goal is to split words (both in the source and target languages) into smaller units which then constitute the input and output vocabularies of the MT system. The aim of reducing the size of the input and output vocabularies is to increase the generalization capabilities of the translation model, enabling the system to translate and generate infrequent and new (unseen) words at inference time by combining previously seen sub-word units. Ideally, we would expect the created units to have some linguistic meaning, so that words are created in a compositional way. However, the most popular word-splitting method, Byte-Pair Encoding (BPE), which originates from the data compression literature, does not include explicit criteria to favor linguistic splittings nor to find the optimal sub-word granularity for the given training data. In this paper, we propose a statistically motivated extension of the BPE algorithm and an effective convergence criterion that avoids the costly experimentation cycle needed to select the best sub-word vocabulary size. Experimental results with morphologically rich languages show that our model achieves nearly-optimal BLEU scores and produces morphologically better word segmentations, which allows to outperform BPE’s generalization in the translation of sentences containing new words, as shown via human evaluation.
%R 10.18653/v1/2021.iwslt-1.31
%U https://aclanthology.org/2021.iwslt-1.31
%U https://doi.org/10.18653/v1/2021.iwslt-1.31
%P 263-275
Markdown (Informal)
[A Statistical Extension of Byte-Pair Encoding](https://aclanthology.org/2021.iwslt-1.31) (Vilar & Federico, IWSLT 2021)
ACL
- David Vilar and Marcello Federico. 2021. A Statistical Extension of Byte-Pair Encoding. In Proceedings of the 18th International Conference on Spoken Language Translation (IWSLT 2021), pages 263–275, Bangkok, Thailand (online). Association for Computational Linguistics.