@inproceedings{thompson-etal-2026-bringing,
title = "Bringing {M}apudungun into the {M}odern {MT} Ecosystem: Morphology-Aware Tokenization for {NLLB}-200 Fine-Tuning",
author = "Thompson, Isaac and
Rogers, Brandon and
Ringger, Eric",
editor = "Mager, Manuel and
Ebrahimi, Abteen and
Bui, Minh Duc and
Pugh, Robert and
Oncevay, Arturo and
Chiruzzo, Luis and
Solano, Rolando Coto and
Rijhwani, Shruti and
Von Der Wense, Katharina",
booktitle = "Proceedings of the Sixth Workshop on {NLP} for Indigenous Languages of the {A}mericas ({A}mericas{NLP})",
month = jul,
year = "2026",
address = "San Diego, California, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.americasnlp-6.16/",
pages = "173--185",
ISBN = "979-8-89176-415-6",
abstract = "For Mapudungun arn{\textrightarrow}es translation, morphology-aware tokenization can substitute for a 5{\texttimes} increase in model parameters. We fine-tune three sizes of Meta{'}s NLLB-200 on Mapudungun{--}Spanish translation across eight tokenization strategies, including our novel Morfessor-VC method, which constrains Morfessor morpheme segmentation to tokens already present in NLLB{'}s pretrained vocabulary. Our 600M Morfessor-VC model is competitive with our own fine-tuned 3.3B Standard BPE model on arn{\textrightarrow}es (43.2 vs. 42.9 chrF++, {\ensuremath{\Delta}} = +0.3, p = 0.039, 95{\%} CI [0.02, 0.60]) while using five times fewer parameters, and all fine-tuned conditions surpass frontier LLMs by over 27 chrF++. Mapudungun is an indigenous polysynthetic language spoken by 200,000+ Mapuche people in Chile and Argentina, absent from NLLB-200 and not supported by major commercial MT providers; prior work predates large-scale multilingual models and does not address the tokenization challenges posed by its agglutinative morphology. These results establish new state-of-the-art baselines for Mapudungun MT and provide a practical foundation for community language tools in pedagogy, social media, and language revitalization."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="thompson-etal-2026-bringing">
<titleInfo>
<title>Bringing Mapudungun into the Modern MT Ecosystem: Morphology-Aware Tokenization for NLLB-200 Fine-Tuning</title>
</titleInfo>
<name type="personal">
<namePart type="given">Isaac</namePart>
<namePart type="family">Thompson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Brandon</namePart>
<namePart type="family">Rogers</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Eric</namePart>
<namePart type="family">Ringger</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Sixth Workshop on NLP for Indigenous Languages of the Americas (AmericasNLP)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Manuel</namePart>
<namePart type="family">Mager</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Abteen</namePart>
<namePart type="family">Ebrahimi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Minh</namePart>
<namePart type="given">Duc</namePart>
<namePart type="family">Bui</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Robert</namePart>
<namePart type="family">Pugh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Arturo</namePart>
<namePart type="family">Oncevay</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Luis</namePart>
<namePart type="family">Chiruzzo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rolando</namePart>
<namePart type="given">Coto</namePart>
<namePart type="family">Solano</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shruti</namePart>
<namePart type="family">Rijhwani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Katharina</namePart>
<namePart type="family">Von Der Wense</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-415-6</identifier>
</relatedItem>
<abstract>For Mapudungun arn→es translation, morphology-aware tokenization can substitute for a 5× increase in model parameters. We fine-tune three sizes of Meta’s NLLB-200 on Mapudungun–Spanish translation across eight tokenization strategies, including our novel Morfessor-VC method, which constrains Morfessor morpheme segmentation to tokens already present in NLLB’s pretrained vocabulary. Our 600M Morfessor-VC model is competitive with our own fine-tuned 3.3B Standard BPE model on arn→es (43.2 vs. 42.9 chrF++, Δ = +0.3, p = 0.039, 95% CI [0.02, 0.60]) while using five times fewer parameters, and all fine-tuned conditions surpass frontier LLMs by over 27 chrF++. Mapudungun is an indigenous polysynthetic language spoken by 200,000+ Mapuche people in Chile and Argentina, absent from NLLB-200 and not supported by major commercial MT providers; prior work predates large-scale multilingual models and does not address the tokenization challenges posed by its agglutinative morphology. These results establish new state-of-the-art baselines for Mapudungun MT and provide a practical foundation for community language tools in pedagogy, social media, and language revitalization.</abstract>
<identifier type="citekey">thompson-etal-2026-bringing</identifier>
<location>
<url>https://aclanthology.org/2026.americasnlp-6.16/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>173</start>
<end>185</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Bringing Mapudungun into the Modern MT Ecosystem: Morphology-Aware Tokenization for NLLB-200 Fine-Tuning
%A Thompson, Isaac
%A Rogers, Brandon
%A Ringger, Eric
%Y Mager, Manuel
%Y Ebrahimi, Abteen
%Y Bui, Minh Duc
%Y Pugh, Robert
%Y Oncevay, Arturo
%Y Chiruzzo, Luis
%Y Solano, Rolando Coto
%Y Rijhwani, Shruti
%Y Von Der Wense, Katharina
%S Proceedings of the Sixth Workshop on NLP for Indigenous Languages of the Americas (AmericasNLP)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, USA
%@ 979-8-89176-415-6
%F thompson-etal-2026-bringing
%X For Mapudungun arn→es translation, morphology-aware tokenization can substitute for a 5× increase in model parameters. We fine-tune three sizes of Meta’s NLLB-200 on Mapudungun–Spanish translation across eight tokenization strategies, including our novel Morfessor-VC method, which constrains Morfessor morpheme segmentation to tokens already present in NLLB’s pretrained vocabulary. Our 600M Morfessor-VC model is competitive with our own fine-tuned 3.3B Standard BPE model on arn→es (43.2 vs. 42.9 chrF++, Δ = +0.3, p = 0.039, 95% CI [0.02, 0.60]) while using five times fewer parameters, and all fine-tuned conditions surpass frontier LLMs by over 27 chrF++. Mapudungun is an indigenous polysynthetic language spoken by 200,000+ Mapuche people in Chile and Argentina, absent from NLLB-200 and not supported by major commercial MT providers; prior work predates large-scale multilingual models and does not address the tokenization challenges posed by its agglutinative morphology. These results establish new state-of-the-art baselines for Mapudungun MT and provide a practical foundation for community language tools in pedagogy, social media, and language revitalization.
%U https://aclanthology.org/2026.americasnlp-6.16/
%P 173-185
Markdown (Informal)
[Bringing Mapudungun into the Modern MT Ecosystem: Morphology-Aware Tokenization for NLLB-200 Fine-Tuning](https://aclanthology.org/2026.americasnlp-6.16/) (Thompson et al., AmericasNLP 2026)
ACL