@inproceedings{ahmed-etal-2024-bangla,
title = "The {B}angla/{B}engali Seed Dataset Submission to the {WMT}24 Open Language Data Initiative Shared Task",
author = "Ahmed, Firoz and
Venkateswaran, Nitin and
Moeller, Sarah",
editor = "Haddow, Barry and
Kocmi, Tom and
Koehn, Philipp and
Monz, Christof",
booktitle = "Proceedings of the Ninth Conference on Machine Translation",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.wmt-1.42",
pages = "556--566",
abstract = "We contribute a seed dataset for the Bangla/Bengali language as part of the WMT24 Open Language Data Initiative shared task. We validate the quality of the dataset against a mined and automatically aligned dataset (NLLBv1) and two other existing datasets of crowdsourced manual translations. The validation is performed by investigating the performance of state-of-the-art translation models fine-tuned on the different datasets after controlling for training set size. Machine translation models fine-tuned on our dataset outperform models tuned on the other datasets in both translation directions (English-Bangla and Bangla-English). These results confirm the quality of our dataset. We hope our dataset will support machine translation for the Bangla/Bengali community and related low-resource languages.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="ahmed-etal-2024-bangla">
<titleInfo>
<title>The Bangla/Bengali Seed Dataset Submission to the WMT24 Open Language Data Initiative Shared Task</title>
</titleInfo>
<name type="personal">
<namePart type="given">Firoz</namePart>
<namePart type="family">Ahmed</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nitin</namePart>
<namePart type="family">Venkateswaran</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sarah</namePart>
<namePart type="family">Moeller</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Ninth Conference on Machine Translation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Barry</namePart>
<namePart type="family">Haddow</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tom</namePart>
<namePart type="family">Kocmi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Philipp</namePart>
<namePart type="family">Koehn</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Christof</namePart>
<namePart type="family">Monz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, Florida, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We contribute a seed dataset for the Bangla/Bengali language as part of the WMT24 Open Language Data Initiative shared task. We validate the quality of the dataset against a mined and automatically aligned dataset (NLLBv1) and two other existing datasets of crowdsourced manual translations. The validation is performed by investigating the performance of state-of-the-art translation models fine-tuned on the different datasets after controlling for training set size. Machine translation models fine-tuned on our dataset outperform models tuned on the other datasets in both translation directions (English-Bangla and Bangla-English). These results confirm the quality of our dataset. We hope our dataset will support machine translation for the Bangla/Bengali community and related low-resource languages.</abstract>
<identifier type="citekey">ahmed-etal-2024-bangla</identifier>
<location>
<url>https://aclanthology.org/2024.wmt-1.42</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>556</start>
<end>566</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T The Bangla/Bengali Seed Dataset Submission to the WMT24 Open Language Data Initiative Shared Task
%A Ahmed, Firoz
%A Venkateswaran, Nitin
%A Moeller, Sarah
%Y Haddow, Barry
%Y Kocmi, Tom
%Y Koehn, Philipp
%Y Monz, Christof
%S Proceedings of the Ninth Conference on Machine Translation
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F ahmed-etal-2024-bangla
%X We contribute a seed dataset for the Bangla/Bengali language as part of the WMT24 Open Language Data Initiative shared task. We validate the quality of the dataset against a mined and automatically aligned dataset (NLLBv1) and two other existing datasets of crowdsourced manual translations. The validation is performed by investigating the performance of state-of-the-art translation models fine-tuned on the different datasets after controlling for training set size. Machine translation models fine-tuned on our dataset outperform models tuned on the other datasets in both translation directions (English-Bangla and Bangla-English). These results confirm the quality of our dataset. We hope our dataset will support machine translation for the Bangla/Bengali community and related low-resource languages.
%U https://aclanthology.org/2024.wmt-1.42
%P 556-566
Markdown (Informal)
[The Bangla/Bengali Seed Dataset Submission to the WMT24 Open Language Data Initiative Shared Task](https://aclanthology.org/2024.wmt-1.42) (Ahmed et al., WMT 2024)
ACL