@inproceedings{velayuthan-etal-2024-back,
title = "Back to the Stats: Rescuing Low Resource Neural Machine Translation with Statistical Methods",
author = "Velayuthan, Menan and
Jayakody, Dilith and
De Silva, Nisansa and
Fernando, Aloka and
Ranathunga, Surangika",
editor = "Haddow, Barry and
Kocmi, Tom and
Koehn, Philipp and
Monz, Christof",
booktitle = "Proceedings of the Ninth Conference on Machine Translation",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.wmt-1.87",
pages = "901--907",
abstract = "This paper describes our submission to the WMT24 shared task for Low-Resource Languages of Spain in the Constrained task category. Due to the lack of deep learning-based data filtration methods for these languages, we propose a purely statistical-based, two-stage pipeline for data filtration. In the primary stage, we begin by removing spaces and punctuation from the source sentences (Spanish) and deduplicating them. We then filter out sentence pairs with inconsistent language predictions by the language identification model, followed by the removal of pairs with anomalous sentence length and word count ratios, using the development set statistics as the threshold. In the secondary stage, for corpora of significant size, we employ a Jensen Shannon divergence-based method to curate training data of the desired size. Our filtered data allowed us to complete a two-step training process in under 3 hours, with GPU power consumption kept below 1 kWh, making our system both economical and eco-friendly. The source code, training data, and best models are available on the project{'}s GitHub page.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="velayuthan-etal-2024-back">
<titleInfo>
<title>Back to the Stats: Rescuing Low Resource Neural Machine Translation with Statistical Methods</title>
</titleInfo>
<name type="personal">
<namePart type="given">Menan</namePart>
<namePart type="family">Velayuthan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dilith</namePart>
<namePart type="family">Jayakody</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nisansa</namePart>
<namePart type="family">De Silva</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aloka</namePart>
<namePart type="family">Fernando</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Surangika</namePart>
<namePart type="family">Ranathunga</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Ninth Conference on Machine Translation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Barry</namePart>
<namePart type="family">Haddow</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tom</namePart>
<namePart type="family">Kocmi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Philipp</namePart>
<namePart type="family">Koehn</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Christof</namePart>
<namePart type="family">Monz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, Florida, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>This paper describes our submission to the WMT24 shared task for Low-Resource Languages of Spain in the Constrained task category. Due to the lack of deep learning-based data filtration methods for these languages, we propose a purely statistical-based, two-stage pipeline for data filtration. In the primary stage, we begin by removing spaces and punctuation from the source sentences (Spanish) and deduplicating them. We then filter out sentence pairs with inconsistent language predictions by the language identification model, followed by the removal of pairs with anomalous sentence length and word count ratios, using the development set statistics as the threshold. In the secondary stage, for corpora of significant size, we employ a Jensen Shannon divergence-based method to curate training data of the desired size. Our filtered data allowed us to complete a two-step training process in under 3 hours, with GPU power consumption kept below 1 kWh, making our system both economical and eco-friendly. The source code, training data, and best models are available on the project’s GitHub page.</abstract>
<identifier type="citekey">velayuthan-etal-2024-back</identifier>
<location>
<url>https://aclanthology.org/2024.wmt-1.87</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>901</start>
<end>907</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Back to the Stats: Rescuing Low Resource Neural Machine Translation with Statistical Methods
%A Velayuthan, Menan
%A Jayakody, Dilith
%A De Silva, Nisansa
%A Fernando, Aloka
%A Ranathunga, Surangika
%Y Haddow, Barry
%Y Kocmi, Tom
%Y Koehn, Philipp
%Y Monz, Christof
%S Proceedings of the Ninth Conference on Machine Translation
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F velayuthan-etal-2024-back
%X This paper describes our submission to the WMT24 shared task for Low-Resource Languages of Spain in the Constrained task category. Due to the lack of deep learning-based data filtration methods for these languages, we propose a purely statistical-based, two-stage pipeline for data filtration. In the primary stage, we begin by removing spaces and punctuation from the source sentences (Spanish) and deduplicating them. We then filter out sentence pairs with inconsistent language predictions by the language identification model, followed by the removal of pairs with anomalous sentence length and word count ratios, using the development set statistics as the threshold. In the secondary stage, for corpora of significant size, we employ a Jensen Shannon divergence-based method to curate training data of the desired size. Our filtered data allowed us to complete a two-step training process in under 3 hours, with GPU power consumption kept below 1 kWh, making our system both economical and eco-friendly. The source code, training data, and best models are available on the project’s GitHub page.
%U https://aclanthology.org/2024.wmt-1.87
%P 901-907
Markdown (Informal)
[Back to the Stats: Rescuing Low Resource Neural Machine Translation with Statistical Methods](https://aclanthology.org/2024.wmt-1.87) (Velayuthan et al., WMT 2024)
ACL