% BibTeX record for the workshop paper at https://aclanthology.org/2026.americasnlp-6.17/
% Case-sensitive words in the title and booktitle are brace-protected as whole words.
% NOTE(review): editor names were normalised to "Coto-Solano, Rolando" (compound
% surname) and "von der Wense, Katharina" (lowercase particles) -- confirm against
% the proceedings front matter.
@inproceedings{cotik-etal-2026-qomlaqtaqa,
  title     = {{QomL'aqtaqa}: A {Qom}--{Spanish} Parallel Corpus for Natural Language Processing with Machine Translation Evaluation},
  author    = {Cotik, Viviana and
               Korablev, Aleksei and
               C{\'u}neo, Paola and
               Laciana, Pablo},
  editor    = {Mager, Manuel and
               Ebrahimi, Abteen and
               Bui, Minh Duc and
               Pugh, Robert and
               Oncevay, Arturo and
               Chiruzzo, Luis and
               Coto-Solano, Rolando and
               Rijhwani, Shruti and
               von der Wense, Katharina},
  booktitle = {Proceedings of the Sixth Workshop on {NLP} for Indigenous Languages of the {Americas} ({AmericasNLP})},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, USA},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.americasnlp-6.17/},
  pages     = {186--202},
  isbn      = {979-8-89176-415-6},
  abstract  = {Qom, a language of the Guaycuruan family, is a low-resource language for NLP and speech processing. We present the first parallel Qom--Spanish corpus in a computationally usable format, comprising 33,392 parallel segments, totaling 1,469,905 Qom tokens and 891,344 Spanish tokens. A subset of 2,943 segments excludes Bible-derived content. It includes alignments at different levels: sentences, sentence fragments, and paragraphs, and is compiled from multiple sources, both previously available and newly collected. We also present bidirectional neural machine translation baselines based on NLLB-200, achieving competitive performance in both translation directions on the full dataset, and lower performance on the non-Bible subset. An ablation study shows that training exclusively on biblical data reduces performance on non-biblical text, highlighting the importance of domain diversity in low-resource machine translation.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="cotik-etal-2026-qomlaqtaqa">
<titleInfo>
<title>QomL’aqtaqa: A Qom–Spanish Parallel Corpus for Natural Language Processing with Machine Translation Evaluation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Viviana</namePart>
<namePart type="family">Cotik</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aleksei</namePart>
<namePart type="family">Korablev</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Paola</namePart>
<namePart type="family">Cúneo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pablo</namePart>
<namePart type="family">Laciana</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Sixth Workshop on NLP for Indigenous Languages of the Americas (AmericasNLP)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Manuel</namePart>
<namePart type="family">Mager</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Abteen</namePart>
<namePart type="family">Ebrahimi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Minh</namePart>
<namePart type="given">Duc</namePart>
<namePart type="family">Bui</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Robert</namePart>
<namePart type="family">Pugh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Arturo</namePart>
<namePart type="family">Oncevay</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Luis</namePart>
<namePart type="family">Chiruzzo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rolando</namePart>
<namePart type="family">Coto-Solano</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shruti</namePart>
<namePart type="family">Rijhwani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Katharina</namePart>
<namePart type="family">von der Wense</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-415-6</identifier>
</relatedItem>
<abstract>Qom, a language of the Guaycuruan family, is a low-resource language for NLP and speech processing. We present the first parallel Qom–Spanish corpus in a computationally usable format, comprising 33,392 parallel segments, totaling 1,469,905 Qom tokens and 891,344 Spanish tokens. A subset of 2,943 segments excludes Bible-derived content. It includes alignments at different levels: sentences, sentence fragments, and paragraphs, and is compiled from multiple sources, both previously available and newly collected. We also present bidirectional neural machine translation baselines based on NLLB-200, achieving competitive performance in both translation directions on the full dataset, and lower performance on the non-Bible subset. An ablation study shows that training exclusively on biblical data reduces performance on non-biblical text, highlighting the importance of domain diversity in low-resource machine translation.</abstract>
<identifier type="citekey">cotik-etal-2026-qomlaqtaqa</identifier>
<location>
<url>https://aclanthology.org/2026.americasnlp-6.17/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>186</start>
<end>202</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T QomL’aqtaqa: A Qom–Spanish Parallel Corpus for Natural Language Processing with Machine Translation Evaluation
%A Cotik, Viviana
%A Korablev, Aleksei
%A Cúneo, Paola
%A Laciana, Pablo
%Y Mager, Manuel
%Y Ebrahimi, Abteen
%Y Bui, Minh Duc
%Y Pugh, Robert
%Y Oncevay, Arturo
%Y Chiruzzo, Luis
%Y Coto-Solano, Rolando
%Y Rijhwani, Shruti
%Y von der Wense, Katharina
%S Proceedings of the Sixth Workshop on NLP for Indigenous Languages of the Americas (AmericasNLP)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, USA
%@ 979-8-89176-415-6
%F cotik-etal-2026-qomlaqtaqa
%X Qom, a language of the Guaycuruan family, is a low-resource language for NLP and speech processing. We present the first parallel Qom–Spanish corpus in a computationally usable format, comprising 33,392 parallel segments, totaling 1,469,905 Qom tokens and 891,344 Spanish tokens. A subset of 2,943 segments excludes Bible-derived content. It includes alignments at different levels: sentences, sentence fragments, and paragraphs, and is compiled from multiple sources, both previously available and newly collected. We also present bidirectional neural machine translation baselines based on NLLB-200, achieving competitive performance in both translation directions on the full dataset, and lower performance on the non-Bible subset. An ablation study shows that training exclusively on biblical data reduces performance on non-biblical text, highlighting the importance of domain diversity in low-resource machine translation.
%U https://aclanthology.org/2026.americasnlp-6.17/
%P 186-202
Markdown (Informal)
[QomL’aqtaqa: A Qom–Spanish Parallel Corpus for Natural Language Processing with Machine Translation Evaluation](https://aclanthology.org/2026.americasnlp-6.17/) (Cotik et al., AmericasNLP 2026)
ACL