@inproceedings{asillo-congora-etal-2025-ucsp,
title = "{UCSP} Submission to the {A}mericas{NLP} 2025 Shared Task",
author = "Asillo Congora, Jorge and
Santisteban, Julio and
Lazo Vasquez, Ricardo",
editor = "Mager, Manuel and
Ebrahimi, Abteen and
Pugh, Robert and
Rijhwani, Shruti and
Von Der Wense, Katharina and
Chiruzzo, Luis and
Coto-Solano, Rolando and
Oncevay, Arturo",
booktitle = "Proceedings of the Fifth Workshop on NLP for Indigenous Languages of the Americas (AmericasNLP)",
month = may,
year = "2025",
address = "Albuquerque, New Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.americasnlp-1.9/",
doi = "10.18653/v1/2025.americasnlp-1.9",
pages = "84--91",
ISBN = "979-8-89176-236-7",
abstract = "Quechua is a low-resource language spoken by more than 7 million people in South America. While Quechua is primarily an oral language, several orthographic standards do exist. There is no universally adopted writing standard for Quechua, and variations exist across dialects and regions; its current writing is based on how it is uttered and how the sound is written. Quechua is a family of languages with similarities among the seven variants. The lack of a parallel dataset has reduced the opportunities for developing machine translation. We investigated whether increasing the current Quechua Parallel dataset with synthetic sentences and using a pre-trained large language model improves the performance of a Quechua machine translation. A Large language model has been used to generate synthetic sentences to extend the current parallel dataset. We use the mt5 model to fine-tune it to develop a machine translation for Quechua to Spanish and vice versa. Our survey identified the gaps in the state of the art of Quechua machine translation, and our BLEU/Chrf++ results show an improvement over the state of the art."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="asillo-congora-etal-2025-ucsp">
<titleInfo>
<title>UCSP Submission to the AmericasNLP 2025 Shared Task</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jorge</namePart>
<namePart type="family">Asillo Congora</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Julio</namePart>
<namePart type="family">Santisteban</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ricardo</namePart>
<namePart type="family">Lazo Vasquez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Fifth Workshop on NLP for Indigenous Languages of the Americas (AmericasNLP)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Manuel</namePart>
<namePart type="family">Mager</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Abteen</namePart>
<namePart type="family">Ebrahimi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Robert</namePart>
<namePart type="family">Pugh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shruti</namePart>
<namePart type="family">Rijhwani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Katharina</namePart>
<namePart type="family">Von Der Wense</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Luis</namePart>
<namePart type="family">Chiruzzo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rolando</namePart>
<namePart type="family">Coto-Solano</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Arturo</namePart>
<namePart type="family">Oncevay</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Albuquerque, New Mexico</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-236-7</identifier>
</relatedItem>
<abstract>Quechua is a low-resource language spoken by more than 7 million people in South America. While Quechua is primarily an oral language, several orthographic standards do exist. There is no universally adopted writing standard for Quechua, and variations exist across dialects and regions; its current writing is based on how it is uttered and how the sound is written. Quechua is a family of languages with similarities among the seven variants. The lack of a parallel dataset has reduced the opportunities for developing machine translation. We investigated whether increasing the current Quechua Parallel dataset with synthetic sentences and using a pre-trained large language model improves the performance of a Quechua machine translation. A Large language model has been used to generate synthetic sentences to extend the current parallel dataset. We use the mt5 model to fine-tune it to develop a machine translation for Quechua to Spanish and vice versa. Our survey identified the gaps in the state of the art of Quechua machine translation, and our BLEU/Chrf++ results show an improvement over the state of the art.</abstract>
<identifier type="citekey">asillo-congora-etal-2025-ucsp</identifier>
<identifier type="doi">10.18653/v1/2025.americasnlp-1.9</identifier>
<location>
<url>https://aclanthology.org/2025.americasnlp-1.9/</url>
</location>
<part>
<date>2025-05</date>
<extent unit="page">
<start>84</start>
<end>91</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T UCSP Submission to the AmericasNLP 2025 Shared Task
%A Asillo Congora, Jorge
%A Santisteban, Julio
%A Lazo Vasquez, Ricardo
%Y Mager, Manuel
%Y Ebrahimi, Abteen
%Y Pugh, Robert
%Y Rijhwani, Shruti
%Y Von Der Wense, Katharina
%Y Chiruzzo, Luis
%Y Coto-Solano, Rolando
%Y Oncevay, Arturo
%S Proceedings of the Fifth Workshop on NLP for Indigenous Languages of the Americas (AmericasNLP)
%D 2025
%8 May
%I Association for Computational Linguistics
%C Albuquerque, New Mexico
%@ 979-8-89176-236-7
%F asillo-congora-etal-2025-ucsp
%X Quechua is a low-resource language spoken by more than 7 million people in South America. While Quechua is primarily an oral language, several orthographic standards do exist. There is no universally adopted writing standard for Quechua, and variations exist across dialects and regions; its current writing is based on how it is uttered and how the sound is written. Quechua is a family of languages with similarities among the seven variants. The lack of a parallel dataset has reduced the opportunities for developing machine translation. We investigated whether increasing the current Quechua Parallel dataset with synthetic sentences and using a pre-trained large language model improves the performance of a Quechua machine translation. A Large language model has been used to generate synthetic sentences to extend the current parallel dataset. We use the mt5 model to fine-tune it to develop a machine translation for Quechua to Spanish and vice versa. Our survey identified the gaps in the state of the art of Quechua machine translation, and our BLEU/Chrf++ results show an improvement over the state of the art.
%R 10.18653/v1/2025.americasnlp-1.9
%U https://aclanthology.org/2025.americasnlp-1.9/
%U https://doi.org/10.18653/v1/2025.americasnlp-1.9
%P 84-91
Markdown (Informal)
[UCSP Submission to the AmericasNLP 2025 Shared Task](https://aclanthology.org/2025.americasnlp-1.9/) (Asillo Congora et al., AmericasNLP 2025)
ACL
- Jorge Asillo Congora, Julio Santisteban, and Ricardo Lazo Vasquez. 2025. UCSP Submission to the AmericasNLP 2025 Shared Task. In Proceedings of the Fifth Workshop on NLP for Indigenous Languages of the Americas (AmericasNLP), pages 84–91, Albuquerque, New Mexico. Association for Computational Linguistics.