% Conference paper in the EAMT 2024 proceedings (Volume 1), pages 647--660.
% Conventions used in this entry:
%   - names are given as "Last, First" and separated by " and ";
%   - accented letters use LaTeX escapes wrapped in their own brace group;
%   - words in the title that must keep their capitalisation are braced as whole words;
%   - "address" holds the location recorded for the proceedings (Sheffield, UK).
@inproceedings{korotkova-fishel-2024-estonian,
  title     = {{Estonian}-Centric Machine Translation: Data, Models, and Challenges},
  author    = {Korotkova, Elizaveta and
               Fishel, Mark},
  editor    = {Scarton, Carolina and
               Prescott, Charlotte and
               Bayliss, Chris and
               Oakley, Chris and
               Wright, Joanna and
               Wrigley, Stuart and
               Song, Xingyi and
               Gow-Smith, Edward and
               Bawden, Rachel and
               S{\'a}nchez-Cartagena, V{\'\i}ctor M. and
               Cadwell, Patrick and
               Lapshinova-Koltunski, Ekaterina and
               Cabarr{\~a}o, Vera and
               Chatzitheodorou, Konstantinos and
               Nurminen, Mary and
               Kanojia, Diptesh and
               Moniz, Helena},
  booktitle = {Proceedings of the 25th Annual Conference of the European Association for Machine Translation (Volume 1)},
  month     = jun,
  year      = {2024},
  address   = {Sheffield, UK},
  publisher = {European Association for Machine Translation (EAMT)},
  url       = {https://aclanthology.org/2024.eamt-1.55},
  pages     = {647--660},
  abstract  = {Machine translation (MT) research is most typically English-centric. In recent years, massively multilingual translation systems have also been increasingly popular. However, efforts purposefully focused on less-resourced languages are less widespread. In this paper, we focus on MT from and into the Estonian language. First, emphasizing the importance of data availability, we generate and publicly release a back-translation corpus of over 2 billion sentence pairs. Second, using these novel data, we create MT models covering 18 translation directions, all either from or into Estonian. We re-use the encoder of the NLLB multilingual model and train modular decoders separately for each language, surpassing the original NLLB quality. Our resulting MT models largely outperform other open-source MT systems, including previous Estonian-focused efforts, and are released as part of this submission.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS (v3 namespace) collection holding one bibliographic record for a conference paper. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<!-- The record ID is the same string as the "citekey" identifier near the end of the record. -->
<mods ID="korotkova-fishel-2024-estonian">
<!-- Title of the paper itself (the proceedings title is inside the host relatedItem below). -->
<titleInfo>
<title>Estonian-Centric Machine Translation: Data, Models, and Challenges</title>
</titleInfo>
<!-- Authors of the paper: one <name> element per person, each with role "author". -->
<name type="personal">
<namePart type="given">Elizaveta</namePart>
<namePart type="family">Korotkova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mark</namePart>
<namePart type="family">Fishel</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<!-- Issue date of the paper, given as year-month. -->
<originInfo>
<dateIssued>2024-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<!-- Host item: the proceedings volume in which the paper appears, with its editors and publisher. -->
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 25th Annual Conference of the European Association for Machine Translation (Volume 1)</title>
</titleInfo>
<!-- Editors of the proceedings volume: one <name> element per person, each with role "editor". -->
<name type="personal">
<namePart type="given">Carolina</namePart>
<namePart type="family">Scarton</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Charlotte</namePart>
<namePart type="family">Prescott</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chris</namePart>
<namePart type="family">Bayliss</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chris</namePart>
<namePart type="family">Oakley</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joanna</namePart>
<namePart type="family">Wright</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stuart</namePart>
<namePart type="family">Wrigley</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xingyi</namePart>
<namePart type="family">Song</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Edward</namePart>
<namePart type="family">Gow-Smith</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rachel</namePart>
<namePart type="family">Bawden</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<!-- This editor has two separate given-name parts (first name, then a single-letter initial). -->
<namePart type="given">Víctor</namePart>
<namePart type="given">M</namePart>
<namePart type="family">Sánchez-Cartagena</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Patrick</namePart>
<namePart type="family">Cadwell</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Lapshinova-Koltunski</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vera</namePart>
<namePart type="family">Cabarrão</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Konstantinos</namePart>
<namePart type="family">Chatzitheodorou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mary</namePart>
<namePart type="family">Nurminen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Diptesh</namePart>
<namePart type="family">Kanojia</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Helena</namePart>
<namePart type="family">Moniz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<!-- Publisher and place recorded for the proceedings volume. -->
<originInfo>
<publisher>European Association for Machine Translation (EAMT)</publisher>
<place>
<placeTerm type="text">Sheffield, UK</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<!-- Abstract of the paper, stored as a single line of plain text. -->
<abstract>Machine translation (MT) research is most typically English-centric. In recent years, massively multilingual translation systems have also been increasingly popular. However, efforts purposefully focused on less-resourced languages are less widespread. In this paper, we focus on MT from and into the Estonian language. First, emphasizing the importance of data availability, we generate and publicly release a back-translation corpus of over 2 billion sentence pairs. Second, using these novel data, we create MT models covering 18 translation directions, all either from or into Estonian. We re-use the encoder of the NLLB multilingual model and train modular decoders separately for each language, surpassing the original NLLB quality. Our resulting MT models largely outperform other open-source MT systems, including previous Estonian-focused efforts, and are released as part of this submission.</abstract>
<!-- Citation key; same string as the ID attribute on the enclosing <mods> element. -->
<identifier type="citekey">korotkova-fishel-2024-estonian</identifier>
<!-- Landing-page URL for the paper. -->
<location>
<url>https://aclanthology.org/2024.eamt-1.55</url>
</location>
<!-- Position of the paper within the host volume: date plus first and last page. -->
<part>
<date>2024-06</date>
<extent unit="page">
<start>647</start>
<end>660</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Estonian-Centric Machine Translation: Data, Models, and Challenges
%A Korotkova, Elizaveta
%A Fishel, Mark
%Y Scarton, Carolina
%Y Prescott, Charlotte
%Y Bayliss, Chris
%Y Oakley, Chris
%Y Wright, Joanna
%Y Wrigley, Stuart
%Y Song, Xingyi
%Y Gow-Smith, Edward
%Y Bawden, Rachel
%Y Sánchez-Cartagena, Víctor M.
%Y Cadwell, Patrick
%Y Lapshinova-Koltunski, Ekaterina
%Y Cabarrão, Vera
%Y Chatzitheodorou, Konstantinos
%Y Nurminen, Mary
%Y Kanojia, Diptesh
%Y Moniz, Helena
%S Proceedings of the 25th Annual Conference of the European Association for Machine Translation (Volume 1)
%D 2024
%8 June
%I European Association for Machine Translation (EAMT)
%C Sheffield, UK
%F korotkova-fishel-2024-estonian
%X Machine translation (MT) research is most typically English-centric. In recent years, massively multilingual translation systems have also been increasingly popular. However, efforts purposefully focused on less-resourced languages are less widespread. In this paper, we focus on MT from and into the Estonian language. First, emphasizing the importance of data availability, we generate and publicly release a back-translation corpus of over 2 billion sentence pairs. Second, using these novel data, we create MT models covering 18 translation directions, all either from or into Estonian. We re-use the encoder of the NLLB multilingual model and train modular decoders separately for each language, surpassing the original NLLB quality. Our resulting MT models largely outperform other open-source MT systems, including previous Estonian-focused efforts, and are released as part of this submission.
%U https://aclanthology.org/2024.eamt-1.55
%P 647-660
Markdown (Informal)
[Estonian-Centric Machine Translation: Data, Models, and Challenges](https://aclanthology.org/2024.eamt-1.55) (Korotkova & Fishel, EAMT 2024)
ACL