@inproceedings{merx-etal-2024-low,
    title     = {Low-Resource Machine Translation through Retrieval-Augmented {LLM} Prompting: A Study on the {Mambai} Language},
    author    = {Merx, Rapha{\"e}l and
                 Mahmudi, Aso and
                 Langford, Katrina and
                 de Araujo, Leo Alberto and
                 Vylomova, Ekaterina},
    editor    = {Ojha, Atul Kr. and
                 Ahmadi, Sina and
                 Cinkov{\'a}, Silvie and
                 Fransen, Theodorus and
                 Liu, Chao-Hong and
                 McCrae, John P.},
    booktitle = {Proceedings of the 2nd Workshop on Resources and Technologies for Indigenous, Endangered and Lesser-resourced Languages in Eurasia (EURALI) @ LREC-COLING 2024},
    month     = may,
    year      = {2024},
    address   = {Torino, Italia},
    publisher = {ELRA and ICCL},
    url       = {https://aclanthology.org/2024.eurali-1.1},
    pages     = {1--11},
    abstract  = {This study explores the use of large language models (LLMs) for translating English into Mambai, a low-resource Austronesian language spoken in Timor-Leste, with approximately 200,000 native speakers. Leveraging a novel corpus derived from a Mambai language manual and additional sentences translated by a native speaker, we examine the efficacy of few-shot LLM prompting for machine translation (MT) in this low-resource context. Our methodology involves the strategic selection of parallel sentences and dictionary entries for prompting, aiming to enhance translation accuracy, using open-source and proprietary LLMs (LlaMa 2 70b, Mixtral 8x7B, GPT-4). We find that including dictionary entries in prompts and a mix of sentences retrieved through TF-IDF and semantic embeddings significantly improves translation quality. However, translation accuracy varies between test sets, highlighting the importance of diverse corpora for evaluating low-resource MT. This research provides insights into few-shot LLM prompting for low-resource MT, and makes available an initial corpus for the Mambai language.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="merx-etal-2024-low">
<titleInfo>
<title>Low-Resource Machine Translation through Retrieval-Augmented LLM Prompting: A Study on the Mambai Language</title>
</titleInfo>
<name type="personal">
<namePart type="given">Raphaël</namePart>
<namePart type="family">Merx</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aso</namePart>
<namePart type="family">Mahmudi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Katrina</namePart>
<namePart type="family">Langford</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Leo</namePart>
<namePart type="given">Alberto</namePart>
<namePart type="family">de Araujo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Vylomova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2nd Workshop on Resources and Technologies for Indigenous, Endangered and Lesser-resourced Languages in Eurasia (EURALI) @ LREC-COLING 2024</title>
</titleInfo>
<name type="personal">
<namePart type="given">Atul</namePart>
<namePart type="given">Kr.</namePart>
<namePart type="family">Ojha</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sina</namePart>
<namePart type="family">Ahmadi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Silvie</namePart>
<namePart type="family">Cinková</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Theodorus</namePart>
<namePart type="family">Fransen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chao-Hong</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">John</namePart>
<namePart type="given">P.</namePart>
<namePart type="family">McCrae</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>ELRA and ICCL</publisher>
<place>
<placeTerm type="text">Torino, Italia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>This study explores the use of large language models (LLMs) for translating English into Mambai, a low-resource Austronesian language spoken in Timor-Leste, with approximately 200,000 native speakers. Leveraging a novel corpus derived from a Mambai language manual and additional sentences translated by a native speaker, we examine the efficacy of few-shot LLM prompting for machine translation (MT) in this low-resource context. Our methodology involves the strategic selection of parallel sentences and dictionary entries for prompting, aiming to enhance translation accuracy, using open-source and proprietary LLMs (LlaMa 2 70b, Mixtral 8x7B, GPT-4). We find that including dictionary entries in prompts and a mix of sentences retrieved through TF-IDF and semantic embeddings significantly improves translation quality. However, translation accuracy varies between test sets, highlighting the importance of diverse corpora for evaluating low-resource MT. This research provides insights into few-shot LLM prompting for low-resource MT, and makes available an initial corpus for the Mambai language.</abstract>
<identifier type="citekey">merx-etal-2024-low</identifier>
<location>
<url>https://aclanthology.org/2024.eurali-1.1</url>
</location>
<part>
<date>2024-05</date>
<extent unit="page">
<start>1</start>
<end>11</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Low-Resource Machine Translation through Retrieval-Augmented LLM Prompting: A Study on the Mambai Language
%A Merx, Raphaël
%A Mahmudi, Aso
%A Langford, Katrina
%A de Araujo, Leo Alberto
%A Vylomova, Ekaterina
%Y Ojha, Atul Kr.
%Y Ahmadi, Sina
%Y Cinková, Silvie
%Y Fransen, Theodorus
%Y Liu, Chao-Hong
%Y McCrae, John P.
%S Proceedings of the 2nd Workshop on Resources and Technologies for Indigenous, Endangered and Lesser-resourced Languages in Eurasia (EURALI) @ LREC-COLING 2024
%D 2024
%8 May
%I ELRA and ICCL
%C Torino, Italia
%F merx-etal-2024-low
%X This study explores the use of large language models (LLMs) for translating English into Mambai, a low-resource Austronesian language spoken in Timor-Leste, with approximately 200,000 native speakers. Leveraging a novel corpus derived from a Mambai language manual and additional sentences translated by a native speaker, we examine the efficacy of few-shot LLM prompting for machine translation (MT) in this low-resource context. Our methodology involves the strategic selection of parallel sentences and dictionary entries for prompting, aiming to enhance translation accuracy, using open-source and proprietary LLMs (LlaMa 2 70b, Mixtral 8x7B, GPT-4). We find that including dictionary entries in prompts and a mix of sentences retrieved through TF-IDF and semantic embeddings significantly improves translation quality. However, translation accuracy varies between test sets, highlighting the importance of diverse corpora for evaluating low-resource MT. This research provides insights into few-shot LLM prompting for low-resource MT, and makes available an initial corpus for the Mambai language.
%U https://aclanthology.org/2024.eurali-1.1
%P 1-11
Markdown (Informal)
[Low-Resource Machine Translation through Retrieval-Augmented LLM Prompting: A Study on the Mambai Language](https://aclanthology.org/2024.eurali-1.1) (Merx et al., EURALI-WS 2024)
ACL