@inproceedings{kumar-thawani-2022-bpe,
title = "{BPE} beyond Word Boundary: How {NOT} to use Multi Word Expressions in Neural Machine Translation",
author = "Kumar, Dipesh and
Thawani, Avijit",
editor = "Tafreshi, Shabnam and
Sedoc, Jo{\~a}o and
Rogers, Anna and
Drozd, Aleksandr and
Rumshisky, Anna and
Akula, Arjun",
booktitle = "Proceedings of the Third Workshop on Insights from Negative Results in NLP",
month = may,
year = "2022",
address = "Dublin, Ireland",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.insights-1.24",
doi = "10.18653/v1/2022.insights-1.24",
pages = "172--179",
abstract = "BPE tokenization merges characters into longer tokens by finding frequently occurring \textbf{contiguous} patterns \textbf{within} the word boundary. An intuitive relaxation would be to extend a BPE vocabulary with multi-word expressions (MWEs): bigrams ($in\_a$), trigrams ($out\_of\_the$), and skip-grams ($he . his$). In the context of Neural Machine Translation (NMT), we replace the least frequent subword/whole-word tokens with the most frequent MWEs. We find that these modifications to BPE end up hurting the model, resulting in a net drop of BLEU and chrF scores across two language pairs. We observe that naively extending BPE beyond word boundaries results in incoherent tokens which are themselves better represented as individual words. Moreover, we find that Pointwise Mutual Information (PMI) instead of frequency finds better MWEs (e.g., $New\_York$, $Statue\_of\_Liberty$, $neither . nor$) which consistently improves translation performance. We release all code at \url{https://github.com/pegasus-lynx/mwe-bpe}.",
}
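The abstract contrasts ranking candidate MWEs by raw frequency with ranking them by pointwise mutual information (PMI). As a minimal sketch, the hypothetical snippet below scores adjacent bigrams by PMI over a pre-tokenized corpus; the function name, the `min_count` threshold, and the corpus handling are illustrative assumptions rather than the authors' code, which is available at the repository linked in the record above.

```python
import math
from collections import Counter

def pmi_bigrams(sentences, min_count=2):
    """Rank adjacent-word bigrams by pointwise mutual information:

        PMI(x, y) = log( p(x, y) / (p(x) * p(y)) )

    High-PMI pairs (e.g. "New York") co-occur far more often than their
    unigram frequencies predict; raw-frequency ranking instead surfaces
    pairs like "in a", which the paper found to hurt translation quality.
    Illustrative sketch only, not the paper's implementation.
    """
    unigrams, bigrams = Counter(), Counter()
    for tokens in sentences:
        unigrams.update(tokens)
        bigrams.update(zip(tokens, tokens[1:]))

    n_uni = sum(unigrams.values())
    n_bi = sum(bigrams.values())

    scores = {}
    for (x, y), count in bigrams.items():
        if count < min_count:  # skip rare pairs: their PMI is unreliable
            continue
        p_xy = count / n_bi
        p_x = unigrams[x] / n_uni
        p_y = unigrams[y] / n_uni
        scores[(x, y)] = math.log(p_xy / (p_x * p_y))

    # Highest-PMI bigrams first: these are the MWE candidates.
    return sorted(scores.items(), key=lambda kv: kv[1], reverse=True)

# Hypothetical usage on a toy corpus:
corpus = [
    "he lives in a flat in New York".split(),
    "she moved to New York in a hurry".split(),
]
for (x, y), score in pmi_bigrams(corpus)[:5]:
    print(f"{x}_{y}\t{score:.2f}")
```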
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="kumar-thawani-2022-bpe">
    <titleInfo>
      <title>BPE beyond Word Boundary: How NOT to use Multi Word Expressions in Neural Machine Translation</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Dipesh</namePart>
      <namePart type="family">Kumar</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Avijit</namePart>
      <namePart type="family">Thawani</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2022-05</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the Third Workshop on Insights from Negative Results in NLP</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Shabnam</namePart>
        <namePart type="family">Tafreshi</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">João</namePart>
        <namePart type="family">Sedoc</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Anna</namePart>
        <namePart type="family">Rogers</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Aleksandr</namePart>
        <namePart type="family">Drozd</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Anna</namePart>
        <namePart type="family">Rumshisky</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Arjun</namePart>
        <namePart type="family">Akula</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Dublin, Ireland</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>BPE tokenization merges characters into longer tokens by finding frequently occurring contiguous patterns within the word boundary. An intuitive relaxation would be to extend a BPE vocabulary with multi-word expressions (MWEs): bigrams (in_a), trigrams (out_of_the), and skip-grams (he . his). In the context of Neural Machine Translation (NMT), we replace the least frequent subword/whole-word tokens with the most frequent MWEs. We find that these modifications to BPE end up hurting the model, resulting in a net drop of BLEU and chrF scores across two language pairs. We observe that naively extending BPE beyond word boundaries results in incoherent tokens which are themselves better represented as individual words. Moreover, we find that Pointwise Mutual Information (PMI) instead of frequency finds better MWEs (e.g., New_York, Statue_of_Liberty, neither . nor) which consistently improves translation performance. We release all code at https://github.com/pegasus-lynx/mwe-bpe.</abstract>
    <identifier type="citekey">kumar-thawani-2022-bpe</identifier>
    <identifier type="doi">10.18653/v1/2022.insights-1.24</identifier>
    <location>
      <url>https://aclanthology.org/2022.insights-1.24</url>
    </location>
    <part>
      <date>2022-05</date>
      <extent unit="page">
        <start>172</start>
        <end>179</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T BPE beyond Word Boundary: How NOT to use Multi Word Expressions in Neural Machine Translation
%A Kumar, Dipesh
%A Thawani, Avijit
%Y Tafreshi, Shabnam
%Y Sedoc, João
%Y Rogers, Anna
%Y Drozd, Aleksandr
%Y Rumshisky, Anna
%Y Akula, Arjun
%S Proceedings of the Third Workshop on Insights from Negative Results in NLP
%D 2022
%8 May
%I Association for Computational Linguistics
%C Dublin, Ireland
%F kumar-thawani-2022-bpe
%X BPE tokenization merges characters into longer tokens by finding frequently occurring contiguous patterns within the word boundary. An intuitive relaxation would be to extend a BPE vocabulary with multi-word expressions (MWEs): bigrams (in_a), trigrams (out_of_the), and skip-grams (he . his). In the context of Neural Machine Translation (NMT), we replace the least frequent subword/whole-word tokens with the most frequent MWEs. We find that these modifications to BPE end up hurting the model, resulting in a net drop of BLEU and chrF scores across two language pairs. We observe that naively extending BPE beyond word boundaries results in incoherent tokens which are themselves better represented as individual words. Moreover, we find that Pointwise Mutual Information (PMI) instead of frequency finds better MWEs (e.g., New_York, Statue_of_Liberty, neither . nor) which consistently improves translation performance. We release all code at https://github.com/pegasus-lynx/mwe-bpe.
%R 10.18653/v1/2022.insights-1.24
%U https://aclanthology.org/2022.insights-1.24
%U https://doi.org/10.18653/v1/2022.insights-1.24
%P 172-179
Markdown (Informal)
[BPE beyond Word Boundary: How NOT to use Multi Word Expressions in Neural Machine Translation](https://aclanthology.org/2022.insights-1.24) (Kumar & Thawani, insights 2022)
ACL
Dipesh Kumar and Avijit Thawani. 2022. BPE beyond Word Boundary: How NOT to use Multi Word Expressions in Neural Machine Translation. In Proceedings of the Third Workshop on Insights from Negative Results in NLP, pages 172–179, Dublin, Ireland. Association for Computational Linguistics.