@inproceedings{merx-etal-2025-low,
title = "Low-resource Machine Translation: what for? who for? An observational study on a dedicated Tetun language translation service",
author = "Merx, Raphael and
Correia, Ad{\'e}rito Jos{\'e} Guterres and
Suominen, Hanna and
Vylomova, Ekaterina",
editor = "Ojha, Atul Kr. and
Liu, Chao-hong and
Vylomova, Ekaterina and
Pirinen, Flammie and
Washington, Jonathan and
Oco, Nathaniel and
Zhao, Xiaobing",
booktitle = "Proceedings of the Eighth Workshop on Technologies for Machine Translation of Low-Resource Languages (LoResMT 2025)",
month = may,
year = "2025",
address = "Albuquerque, New Mexico, U.S.A.",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.loresmt-1.7/",
doi = "10.18653/v1/2025.loresmt-1.7",
pages = "54--65",
ISBN = "979-8-89176-230-5",
abstract = "Low-resource machine translation (MT) presents a diversity of community needs and application challenges that remain poorly understood. To complement surveys and focus groups, which tend to rely on small samples of respondents, we propose an observational study on actual usage patterns of a specialized MT service for the Tetun language, which is the lingua franca in Timor-Leste. Our analysis of 100,000 translation requests reveals patterns that challenge assumptions based on existing corpora. We find that users, many of them students on mobile devices, typically translate text from a high-resource language into Tetun across diverse domains including science, healthcare, and daily life. This contrasts sharply with available Tetun corpora, which are dominated by news articles covering government and social issues.Our results suggest that MT systems for institutionalized minority languages like Tetun should prioritize accuracy on domains relevant to educational contexts, in the high-resource to low-resource direction. More broadly, this study demonstrates how observational analysis can inform low-resource language technology development, by grounding research in practical community needs."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="merx-etal-2025-low">
<titleInfo>
<title>Low-resource Machine Translation: what for? who for? An observational study on a dedicated Tetun language translation service</title>
</titleInfo>
<name type="personal">
<namePart type="given">Raphael</namePart>
<namePart type="family">Merx</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Adérito</namePart>
<namePart type="given">José</namePart>
<namePart type="given">Guterres</namePart>
<namePart type="family">Correia</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hanna</namePart>
<namePart type="family">Suominen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Vylomova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Eighth Workshop on Technologies for Machine Translation of Low-Resource Languages (LoResMT 2025)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Atul</namePart>
<namePart type="given">Kr.</namePart>
<namePart type="family">Ojha</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chao-hong</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Vylomova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Flammie</namePart>
<namePart type="family">Pirinen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jonathan</namePart>
<namePart type="family">Washington</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nathaniel</namePart>
<namePart type="family">Oco</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xiaobing</namePart>
<namePart type="family">Zhao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Albuquerque, New Mexico, U.S.A.</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-230-5</identifier>
</relatedItem>
<abstract>Low-resource machine translation (MT) presents a diversity of community needs and application challenges that remain poorly understood. To complement surveys and focus groups, which tend to rely on small samples of respondents, we propose an observational study on actual usage patterns of a specialized MT service for the Tetun language, which is the lingua franca in Timor-Leste. Our analysis of 100,000 translation requests reveals patterns that challenge assumptions based on existing corpora. We find that users, many of them students on mobile devices, typically translate text from a high-resource language into Tetun across diverse domains including science, healthcare, and daily life. This contrasts sharply with available Tetun corpora, which are dominated by news articles covering government and social issues.Our results suggest that MT systems for institutionalized minority languages like Tetun should prioritize accuracy on domains relevant to educational contexts, in the high-resource to low-resource direction. More broadly, this study demonstrates how observational analysis can inform low-resource language technology development, by grounding research in practical community needs.</abstract>
<identifier type="citekey">merx-etal-2025-low</identifier>
<identifier type="doi">10.18653/v1/2025.loresmt-1.7</identifier>
<location>
<url>https://aclanthology.org/2025.loresmt-1.7/</url>
</location>
<part>
<date>2025-05</date>
<extent unit="page">
<start>54</start>
<end>65</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Low-resource Machine Translation: what for? who for? An observational study on a dedicated Tetun language translation service
%A Merx, Raphael
%A Correia, Adérito José Guterres
%A Suominen, Hanna
%A Vylomova, Ekaterina
%Y Ojha, Atul Kr.
%Y Liu, Chao-hong
%Y Vylomova, Ekaterina
%Y Pirinen, Flammie
%Y Washington, Jonathan
%Y Oco, Nathaniel
%Y Zhao, Xiaobing
%S Proceedings of the Eighth Workshop on Technologies for Machine Translation of Low-Resource Languages (LoResMT 2025)
%D 2025
%8 May
%I Association for Computational Linguistics
%C Albuquerque, New Mexico, U.S.A.
%@ 979-8-89176-230-5
%F merx-etal-2025-low
%X Low-resource machine translation (MT) presents a diversity of community needs and application challenges that remain poorly understood. To complement surveys and focus groups, which tend to rely on small samples of respondents, we propose an observational study on actual usage patterns of a specialized MT service for the Tetun language, which is the lingua franca in Timor-Leste. Our analysis of 100,000 translation requests reveals patterns that challenge assumptions based on existing corpora. We find that users, many of them students on mobile devices, typically translate text from a high-resource language into Tetun across diverse domains including science, healthcare, and daily life. This contrasts sharply with available Tetun corpora, which are dominated by news articles covering government and social issues.Our results suggest that MT systems for institutionalized minority languages like Tetun should prioritize accuracy on domains relevant to educational contexts, in the high-resource to low-resource direction. More broadly, this study demonstrates how observational analysis can inform low-resource language technology development, by grounding research in practical community needs.
%R 10.18653/v1/2025.loresmt-1.7
%U https://aclanthology.org/2025.loresmt-1.7/
%U https://doi.org/10.18653/v1/2025.loresmt-1.7
%P 54-65
Markdown (Informal)
[Low-resource Machine Translation: what for? who for? An observational study on a dedicated Tetun language translation service](https://aclanthology.org/2025.loresmt-1.7/) (Merx et al., LoResMT 2025)
ACL