@inproceedings{song-etal-2024-submerge,
title = "{S}ub{M}erge: Merging Equivalent Subword Tokenizations for Subword Regularized Models in Neural Machine Translation",
author = "Song, Haiyue and
Meyer, Francois and
Dabre, Raj and
Tanaka, Hideki and
Chu, Chenhui and
Kurohashi, Sadao",
editor = "Scarton, Carolina and
Prescott, Charlotte and
Bayliss, Chris and
Oakley, Chris and
Wright, Joanna and
Wrigley, Stuart and
Song, Xingyi and
Gow-Smith, Edward and
Bawden, Rachel and
S{\'a}nchez-Cartagena, V{\'\i}ctor M and
Cadwell, Patrick and
Lapshinova-Koltunski, Ekaterina and
Cabarr{\~a}o, Vera and
Chatzitheodorou, Konstantinos and
Nurminen, Mary and
Kanojia, Diptesh and
Moniz, Helena",
booktitle = "Proceedings of the 25th Annual Conference of the European Association for Machine Translation (Volume 1)",
month = jun,
year = "2024",
address = "Sheffield, UK",
publisher = "European Association for Machine Translation (EAMT)",
url = "https://aclanthology.org/2024.eamt-1.15",
pages = "147--163",
abstract = "Subword regularized models leverage multiple subword tokenizations of one target sentence during training. However, selecting one tokenization during inference leads to the underutilization of knowledge learned about multiple tokenizations.We propose the SubMerge algorithm to rescue the ignored Subword tokenizations through merging equivalent ones during inference.SubMerge is a nested search algorithm where the outer beam search treats the word as the minimal unit, and the inner beam search provides a list of word candidates and their probabilities, merging equivalent subword tokenizations. SubMerge estimates the probability of the next word more precisely, providing better guidance during inference.Experimental results on six low-resource to high-resource machine translation datasets show that SubMerge utilizes a greater proportion of a model{'}s probability weight during decoding (lower word perplexities for hypotheses). It also improves BLEU and chrF++ scores for many translation directions, most reliably for low-resource scenarios. We investigate the effect of different beam sizes, training set sizes, dropout rates, and whether it is effective on non-regularized models.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="song-etal-2024-submerge">
<titleInfo>
<title>SubMerge: Merging Equivalent Subword Tokenizations for Subword Regularized Models in Neural Machine Translation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Haiyue</namePart>
<namePart type="family">Song</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Francois</namePart>
<namePart type="family">Meyer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Raj</namePart>
<namePart type="family">Dabre</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hideki</namePart>
<namePart type="family">Tanaka</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chenhui</namePart>
<namePart type="family">Chu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sadao</namePart>
<namePart type="family">Kurohashi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 25th Annual Conference of the European Association for Machine Translation (Volume 1)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Carolina</namePart>
<namePart type="family">Scarton</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Charlotte</namePart>
<namePart type="family">Prescott</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chris</namePart>
<namePart type="family">Bayliss</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chris</namePart>
<namePart type="family">Oakley</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joanna</namePart>
<namePart type="family">Wright</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stuart</namePart>
<namePart type="family">Wrigley</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xingyi</namePart>
<namePart type="family">Song</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Edward</namePart>
<namePart type="family">Gow-Smith</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rachel</namePart>
<namePart type="family">Bawden</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Víctor</namePart>
<namePart type="given">M</namePart>
<namePart type="family">Sánchez-Cartagena</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Patrick</namePart>
<namePart type="family">Cadwell</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Lapshinova-Koltunski</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vera</namePart>
<namePart type="family">Cabarrão</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Konstantinos</namePart>
<namePart type="family">Chatzitheodorou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mary</namePart>
<namePart type="family">Nurminen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Diptesh</namePart>
<namePart type="family">Kanojia</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Helena</namePart>
<namePart type="family">Moniz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>European Association for Machine Translation (EAMT)</publisher>
<place>
<placeTerm type="text">Sheffield, UK</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Subword regularized models leverage multiple subword tokenizations of one target sentence during training. However, selecting one tokenization during inference leads to the underutilization of knowledge learned about multiple tokenizations.We propose the SubMerge algorithm to rescue the ignored Subword tokenizations through merging equivalent ones during inference.SubMerge is a nested search algorithm where the outer beam search treats the word as the minimal unit, and the inner beam search provides a list of word candidates and their probabilities, merging equivalent subword tokenizations. SubMerge estimates the probability of the next word more precisely, providing better guidance during inference.Experimental results on six low-resource to high-resource machine translation datasets show that SubMerge utilizes a greater proportion of a model’s probability weight during decoding (lower word perplexities for hypotheses). It also improves BLEU and chrF++ scores for many translation directions, most reliably for low-resource scenarios. We investigate the effect of different beam sizes, training set sizes, dropout rates, and whether it is effective on non-regularized models.</abstract>
<identifier type="citekey">song-etal-2024-submerge</identifier>
<location>
<url>https://aclanthology.org/2024.eamt-1.15</url>
</location>
<part>
<date>2024-06</date>
<extent unit="page">
<start>147</start>
<end>163</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T SubMerge: Merging Equivalent Subword Tokenizations for Subword Regularized Models in Neural Machine Translation
%A Song, Haiyue
%A Meyer, Francois
%A Dabre, Raj
%A Tanaka, Hideki
%A Chu, Chenhui
%A Kurohashi, Sadao
%Y Scarton, Carolina
%Y Prescott, Charlotte
%Y Bayliss, Chris
%Y Oakley, Chris
%Y Wright, Joanna
%Y Wrigley, Stuart
%Y Song, Xingyi
%Y Gow-Smith, Edward
%Y Bawden, Rachel
%Y Sánchez-Cartagena, Víctor M.
%Y Cadwell, Patrick
%Y Lapshinova-Koltunski, Ekaterina
%Y Cabarrão, Vera
%Y Chatzitheodorou, Konstantinos
%Y Nurminen, Mary
%Y Kanojia, Diptesh
%Y Moniz, Helena
%S Proceedings of the 25th Annual Conference of the European Association for Machine Translation (Volume 1)
%D 2024
%8 June
%I European Association for Machine Translation (EAMT)
%C Sheffield, UK
%F song-etal-2024-submerge
%X Subword regularized models leverage multiple subword tokenizations of one target sentence during training. However, selecting one tokenization during inference leads to the underutilization of knowledge learned about multiple tokenizations.We propose the SubMerge algorithm to rescue the ignored Subword tokenizations through merging equivalent ones during inference.SubMerge is a nested search algorithm where the outer beam search treats the word as the minimal unit, and the inner beam search provides a list of word candidates and their probabilities, merging equivalent subword tokenizations. SubMerge estimates the probability of the next word more precisely, providing better guidance during inference.Experimental results on six low-resource to high-resource machine translation datasets show that SubMerge utilizes a greater proportion of a model’s probability weight during decoding (lower word perplexities for hypotheses). It also improves BLEU and chrF++ scores for many translation directions, most reliably for low-resource scenarios. We investigate the effect of different beam sizes, training set sizes, dropout rates, and whether it is effective on non-regularized models.
%U https://aclanthology.org/2024.eamt-1.15
%P 147-163
Markdown (Informal)
[SubMerge: Merging Equivalent Subword Tokenizations for Subword Regularized Models in Neural Machine Translation](https://aclanthology.org/2024.eamt-1.15) (Song et al., EAMT 2024)
ACL