% LoResMT 2024 workshop paper; the url and doi fields share the Anthology ID 2024.loresmt-1.19.
@inproceedings{begoli-etal-2024-rosetta,
    title = "Rosetta Balcanica: Deriving a {\textquotedblleft}Gold Standard{\textquotedblright} Neural Machine Translation ({NMT}) Parallel Dataset from High-Fidelity Resources for {W}estern {B}alkan Languages",
    author = "Begoli, Edmon and
      Mahbub, Maria and
      Srinivasan, Sudarshan",
    editor = "Ojha, Atul Kr. and
      Liu, Chao-hong and
      Vylomova, Ekaterina and
      Pirinen, Flammie and
      Abbott, Jade and
      Washington, Jonathan and
      Oco, Nathaniel and
      Malykh, Valentin and
      Logacheva, Varvara and
      Zhao, Xiaobing",
    booktitle = "Proceedings of the Seventh Workshop on Technologies for Machine Translation of Low-Resource Languages (LoResMT 2024)",
    month = aug,
    year = "2024",
    address = "Bangkok, Thailand",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2024.loresmt-1.19/",
    doi = "10.18653/v1/2024.loresmt-1.19",
    pages = "186--192",
    abstract = "The Rosetta Balcanica is an ongoing effort in resource expansion for low-resource Western Balkans languages. This effort focuses on discovering and using accurately translated, officially mapped, and curated parallel language resources and their preparation and use as neural machine translation (NMT) datasets. Some of the guiding principles, practices, and methods employed by Rosetta Balcanica are generalizable and could apply to other low-resource language resource expansion efforts. With this goal in mind, we present our rationale and approach to discovering and using meticulously translated and officially curated low-resource language resources and our use of these resources to develop a parallel {\textquotedblleft}gold standard{\textquotedblright} translation training resource. Secondly, we describe our specific methodology for NMT dataset development from these resources and its publication to a widely-used and accessible repository for natural language processing (\textit{Hugging Face Hub}). Finally, we discuss the trade-offs and limitations of our current approach, and the roadmap for future development and the expansion of the current Rosetta Balcanica language resource."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="begoli-etal-2024-rosetta">
<titleInfo>
<title>Rosetta Balcanica: Deriving a “Gold Standard” Neural Machine Translation (NMT) Parallel Dataset from High-Fidelity Resources for Western Balkan Languages</title>
</titleInfo>
<name type="personal">
<namePart type="given">Edmon</namePart>
<namePart type="family">Begoli</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Mahbub</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sudarshan</namePart>
<namePart type="family">Srinivasan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Seventh Workshop on Technologies for Machine Translation of Low-Resource Languages (LoResMT 2024)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Atul</namePart>
<namePart type="given">Kr.</namePart>
<namePart type="family">Ojha</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chao-hong</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Vylomova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Flammie</namePart>
<namePart type="family">Pirinen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jade</namePart>
<namePart type="family">Abbott</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jonathan</namePart>
<namePart type="family">Washington</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nathaniel</namePart>
<namePart type="family">Oco</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Valentin</namePart>
<namePart type="family">Malykh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Varvara</namePart>
<namePart type="family">Logacheva</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xiaobing</namePart>
<namePart type="family">Zhao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Bangkok, Thailand</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>The Rosetta Balcanica is an ongoing effort in resource expansion for low-resource Western Balkans languages. This effort focuses on discovering and using accurately translated, officially mapped, and curated parallel language resources and their preparation and use as neural machine translation (NMT) datasets. Some of the guiding principles, practices, and methods employed by Rosetta Balcanica are generalizable and could apply to other low-resource language resource expansion efforts. With this goal in mind, we present our rationale and approach to discovering and using meticulously translated and officially curated low-resource language resources and our use of these resources to develop a parallel “gold standard” translation training resource. Secondly, we describe our specific methodology for NMT dataset development from these resources and its publication to a widely-used and accessible repository for natural language processing (Hugging Face Hub). Finally, we discuss the trade-offs and limitations of our current approach, and the roadmap for future development and the expansion of the current Rosetta Balcanica language resource.</abstract>
<identifier type="citekey">begoli-etal-2024-rosetta</identifier>
<identifier type="doi">10.18653/v1/2024.loresmt-1.19</identifier>
<location>
<url>https://aclanthology.org/2024.loresmt-1.19/</url>
</location>
<part>
<date>2024-08</date>
<extent unit="page">
<start>186</start>
<end>192</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Rosetta Balcanica: Deriving a “Gold Standard” Neural Machine Translation (NMT) Parallel Dataset from High-Fidelity Resources for Western Balkan Languages
%A Begoli, Edmon
%A Mahbub, Maria
%A Srinivasan, Sudarshan
%Y Ojha, Atul Kr.
%Y Liu, Chao-hong
%Y Vylomova, Ekaterina
%Y Pirinen, Flammie
%Y Abbott, Jade
%Y Washington, Jonathan
%Y Oco, Nathaniel
%Y Malykh, Valentin
%Y Logacheva, Varvara
%Y Zhao, Xiaobing
%S Proceedings of the Seventh Workshop on Technologies for Machine Translation of Low-Resource Languages (LoResMT 2024)
%D 2024
%8 August
%I Association for Computational Linguistics
%C Bangkok, Thailand
%F begoli-etal-2024-rosetta
%X The Rosetta Balcanica is an ongoing effort in resource expansion for low-resource Western Balkans languages. This effort focuses on discovering and using accurately translated, officially mapped, and curated parallel language resources and their preparation and use as neural machine translation (NMT) datasets. Some of the guiding principles, practices, and methods employed by Rosetta Balcanica are generalizable and could apply to other low-resource language resource expansion efforts. With this goal in mind, we present our rationale and approach to discovering and using meticulously translated and officially curated low-resource language resources and our use of these resources to develop a parallel “gold standard” translation training resource. Secondly, we describe our specific methodology for NMT dataset development from these resources and its publication to a widely-used and accessible repository for natural language processing (Hugging Face Hub). Finally, we discuss the trade-offs and limitations of our current approach, and the roadmap for future development and the expansion of the current Rosetta Balcanica language resource.
%R 10.18653/v1/2024.loresmt-1.19
%U https://aclanthology.org/2024.loresmt-1.19/
%U https://doi.org/10.18653/v1/2024.loresmt-1.19
%P 186-192
Markdown (Informal)
[Rosetta Balcanica: Deriving a “Gold Standard” Neural Machine Translation (NMT) Parallel Dataset from High-Fidelity Resources for Western Balkan Languages](https://aclanthology.org/2024.loresmt-1.19/) (Begoli et al., LoResMT 2024)
ACL