@inproceedings{khiu-etal-2024-predicting,
title = "Predicting Machine Translation Performance on Low-Resource Languages: The Role of Domain Similarity",
author = {Khiu, Eric and
Toossi, Hasti and
Anugraha, David and
Liu, Jinyu and
Li, Jiaxu and
Flores, Juan and
Roman, Leandro and
Do{\u{g}}ru{\"o}z, A. Seza and
Lee, En-Shiun},
editor = "Graham, Yvette and
Purver, Matthew",
booktitle = "Findings of the Association for Computational Linguistics: EACL 2024",
month = mar,
year = "2024",
address = "St. Julian{'}s, Malta",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.findings-eacl.100",
pages = "1474--1486",
abstract = "Fine-tuning and testing a multilingual large language model is a challenge for low-resource languages (LRLs) since it is an expensive process. While previous studies have predicted the performance of natural language processing (NLP) tasks using machine learning methods, they primarily focus on high-resource languages, overlooking LRLs and shifts across domains. Focusing on LRLs, we investigate three factors (the size of the fine-tuning corpus, domain similarity between fine-tuning and testing corpora, and language similarity between source and target languages), which can potentially impact the model performance by using classical regression models. Our results indicate that domain similarity has the most important impact on predicting the performance of Machine Translation models.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="khiu-etal-2024-predicting">
<titleInfo>
<title>Predicting Machine Translation Performance on Low-Resource Languages: The Role of Domain Similarity</title>
</titleInfo>
<name type="personal">
<namePart type="given">Eric</namePart>
<namePart type="family">Khiu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hasti</namePart>
<namePart type="family">Toossi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Anugraha</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jinyu</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiaxu</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Juan</namePart>
<namePart type="family">Flores</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Leandro</namePart>
<namePart type="family">Roman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">A</namePart>
<namePart type="given">Seza</namePart>
<namePart type="family">Doğruöz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">En-Shiun</namePart>
<namePart type="family">Lee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EACL 2024</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yvette</namePart>
<namePart type="family">Graham</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Matthew</namePart>
<namePart type="family">Purver</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">St. Julian’s, Malta</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Fine-tuning and testing a multilingual large language model is a challenge for low-resource languages (LRLs) since it is an expensive process. While previous studies have predicted the performance of natural language processing (NLP) tasks using machine learning methods, they primarily focus on high-resource languages, overlooking LRLs and shifts across domains. Focusing on LRLs, we investigate three factors (the size of the fine-tuning corpus, domain similarity between fine-tuning and testing corpora, and language similarity between source and target languages), which can potentially impact the model performance by using classical regression models. Our results indicate that domain similarity has the most important impact on predicting the performance of Machine Translation models.</abstract>
<identifier type="citekey">khiu-etal-2024-predicting</identifier>
<location>
<url>https://aclanthology.org/2024.findings-eacl.100</url>
</location>
<part>
<date>2024-03</date>
<extent unit="page">
<start>1474</start>
<end>1486</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Predicting Machine Translation Performance on Low-Resource Languages: The Role of Domain Similarity
%A Khiu, Eric
%A Toossi, Hasti
%A Anugraha, David
%A Liu, Jinyu
%A Li, Jiaxu
%A Flores, Juan
%A Roman, Leandro
%A Doğruöz, A. Seza
%A Lee, En-Shiun
%Y Graham, Yvette
%Y Purver, Matthew
%S Findings of the Association for Computational Linguistics: EACL 2024
%D 2024
%8 March
%I Association for Computational Linguistics
%C St. Julian’s, Malta
%F khiu-etal-2024-predicting
%X Fine-tuning and testing a multilingual large language model is a challenge for low-resource languages (LRLs) since it is an expensive process. While previous studies have predicted the performance of natural language processing (NLP) tasks using machine learning methods, they primarily focus on high-resource languages, overlooking LRLs and shifts across domains. Focusing on LRLs, we investigate three factors (the size of the fine-tuning corpus, domain similarity between fine-tuning and testing corpora, and language similarity between source and target languages), which can potentially impact the model performance by using classical regression models. Our results indicate that domain similarity has the most important impact on predicting the performance of Machine Translation models.
%U https://aclanthology.org/2024.findings-eacl.100
%P 1474-1486
Markdown (Informal)
[Predicting Machine Translation Performance on Low-Resource Languages: The Role of Domain Similarity](https://aclanthology.org/2024.findings-eacl.100) (Khiu et al., Findings 2024)
ACL
- Eric Khiu, Hasti Toossi, David Anugraha, Jinyu Liu, Jiaxu Li, Juan Flores, Leandro Roman, A. Seza Doğruöz, and En-Shiun Lee. 2024. Predicting Machine Translation Performance on Low-Resource Languages: The Role of Domain Similarity. In Findings of the Association for Computational Linguistics: EACL 2024, pages 1474–1486, St. Julian’s, Malta. Association for Computational Linguistics.