@inproceedings{adlaon-marcos-2024-finding,
title = "Finding the Optimal Byte-Pair Encoding Merge Operations for Neural Machine Translation in a Low-Resource Setting",
author = "Adlaon, Kristine and
Marcos, Nelson",
editor = "Al-Onaizan, Yaser and
Bansal, Mohit and
Chen, Yun-Nung",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2024",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.findings-emnlp.860",
pages = "14673--14682",
abstract = "This paper investigates the impact of different Byte Pair Encoding (BPE) configurations, specifically, merge operations on neural machine translation (NMT) performance for the Filipino-Cebuano language pair across various text domains. Results demonstrate that smaller BPE configurations, notably 2k, 5k, and 8k consistently yield higher BLEU scores, indicating improved translation quality through finer tokenization granularity. Conversely, larger BPE configurations and the absence of BPE result in lower BLEU scores, suggesting a decline in translation quality due to coarser tokenization. Additionally, these findings help us understand how the size of the model and how finely we break down words affect the quality of translations. This knowledge will be useful for improving translation systems, especially for languages that don{'}t have many parallel texts available for training.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="adlaon-marcos-2024-finding">
<titleInfo>
<title>Finding the Optimal Byte-Pair Encoding Merge Operations for Neural Machine Translation in a Low-Resource Setting</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kristine</namePart>
<namePart type="family">Adlaon</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nelson</namePart>
<namePart type="family">Marcos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2024</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yaser</namePart>
<namePart type="family">Al-Onaizan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohit</namePart>
<namePart type="family">Bansal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yun-Nung</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, Florida, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
    <abstract>This paper investigates the impact of different Byte Pair Encoding (BPE) configurations, specifically merge operations, on neural machine translation (NMT) performance for the Filipino-Cebuano language pair across various text domains. Results demonstrate that smaller BPE configurations, notably 2k, 5k, and 8k, consistently yield higher BLEU scores, indicating improved translation quality through finer tokenization granularity. Conversely, larger BPE configurations and the absence of BPE result in lower BLEU scores, suggesting a decline in translation quality due to coarser tokenization. Additionally, these findings help us understand how the size of the model and how finely we break down words affect the quality of translations. This knowledge will be useful for improving translation systems, especially for languages that don’t have many parallel texts available for training.</abstract>
    <identifier type="citekey">adlaon-marcos-2024-finding</identifier>
    <location>
      <url>https://aclanthology.org/2024.findings-emnlp.860</url>
    </location>
    <part>
      <date>2024-11</date>
      <extent unit="page">
        <start>14673</start>
        <end>14682</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Finding the Optimal Byte-Pair Encoding Merge Operations for Neural Machine Translation in a Low-Resource Setting
%A Adlaon, Kristine
%A Marcos, Nelson
%Y Al-Onaizan, Yaser
%Y Bansal, Mohit
%Y Chen, Yun-Nung
%S Findings of the Association for Computational Linguistics: EMNLP 2024
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F adlaon-marcos-2024-finding
%X This paper investigates the impact of different Byte Pair Encoding (BPE) configurations, specifically merge operations, on neural machine translation (NMT) performance for the Filipino-Cebuano language pair across various text domains. Results demonstrate that smaller BPE configurations, notably 2k, 5k, and 8k, consistently yield higher BLEU scores, indicating improved translation quality through finer tokenization granularity. Conversely, larger BPE configurations and the absence of BPE result in lower BLEU scores, suggesting a decline in translation quality due to coarser tokenization. Additionally, these findings help us understand how the size of the model and how finely we break down words affect the quality of translations. This knowledge will be useful for improving translation systems, especially for languages that don’t have many parallel texts available for training.
%U https://aclanthology.org/2024.findings-emnlp.860
%P 14673-14682
Markdown (Informal)
[Finding the Optimal Byte-Pair Encoding Merge Operations for Neural Machine Translation in a Low-Resource Setting](https://aclanthology.org/2024.findings-emnlp.860) (Adlaon & Marcos, Findings 2024)
ACL
Kristine Adlaon and Nelson Marcos. 2024. Finding the Optimal Byte-Pair Encoding Merge Operations for Neural Machine Translation in a Low-Resource Setting. In Findings of the Association for Computational Linguistics: EMNLP 2024, pages 14673–14682, Miami, Florida, USA. Association for Computational Linguistics.
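
The abstract above turns on a single knob: the number of BPE merge operations learned from the training data (2k, 5k, 8k, and larger). As an illustrative aside, the toy sketch below shows how that knob controls segmentation granularity. It is a minimal Sennrich-style BPE learner written for this note, not the paper's actual pipeline; the toy corpus, merge counts, and function names (`learn_bpe`, `get_stats`, `merge_vocab`) are hypothetical.

```python
# Toy BPE merge learner: fewer merges -> finer sub-word segmentation,
# more merges -> coarser units. Illustrative only, not the paper's setup.
import collections
import re

def get_stats(vocab):
    """Count frequencies of adjacent symbol pairs across the vocabulary."""
    pairs = collections.Counter()
    for word, freq in vocab.items():
        symbols = word.split()
        for i in range(len(symbols) - 1):
            pairs[(symbols[i], symbols[i + 1])] += freq
    return pairs

def merge_vocab(pair, vocab):
    """Merge every occurrence of the given symbol pair into one symbol."""
    bigram = re.escape(' '.join(pair))
    pattern = re.compile(r'(?<!\S)' + bigram + r'(?!\S)')
    return {pattern.sub(''.join(pair), word): freq for word, freq in vocab.items()}

def learn_bpe(word_freqs, num_merges):
    """Learn up to `num_merges` merge operations from a {word: frequency} dict."""
    vocab = {' '.join(word) + ' </w>': freq for word, freq in word_freqs.items()}
    merges = []
    for _ in range(num_merges):
        pairs = get_stats(vocab)
        if not pairs:          # every word fully merged; nothing left to learn
            break
        best = max(pairs, key=pairs.get)
        vocab = merge_vocab(best, vocab)
        merges.append(best)
    return merges

# Toy word frequencies; the paper's experiments use Filipino-Cebuano parallel
# text and much larger merge counts (e.g. 2k, 5k, 8k).
corpus = {'low': 5, 'lower': 2, 'newest': 6, 'widest': 3}
for n in (5, 10, 20):
    print(n, learn_bpe(corpus, n))
```

Running the sketch shows that a small merge budget leaves words split into many short sub-word units, while a larger budget progressively reassembles whole words, which is the granularity trade-off the abstract describes.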