@inproceedings{nguefack-etal-2025-pretraining,
title = "Pretraining Strategies using Monolingual and Parallel Data for Low-Resource Machine Translation",
author = "Nguefack, Idriss Nguepi and
Finkelstein, Mara and
Sakayo, Toadoum Sari",
editor = "Lignos, Constantine and
Abdulmumin, Idris and
Adelani, David",
booktitle = "Proceedings of the Sixth Workshop on African Natural Language Processing (AfricaNLP 2025)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.africanlp-1.6/",
doi = "10.18653/v1/2025.africanlp-1.6",
pages = "31--38",
ISBN = "979-8-89176-257-2",
abstract = "This research article examines the effectiveness of various pretraining strategies for developing machine translation models tailored to low-resource languages. Although this work considers several low-resource languages, including Afrikaans, Swahili, and Zulu, the translation model is specifically developed for Lingala, an under-resourced African language, building upon the pretraining approach introduced byReid and Artetxe (2021), originally designed for high-resource languages. Through a series of comprehensive experiments, we explore different pretraining methodologies, including the integration of multiple languages and the use of both monolingual and parallel data during the pretraining phase. Our findings indicate that pretraining on multiple languages and leveraging both monolingual and parallel data significantly enhance translation quality. This study offers valuable insights into effective pretraining strategies for low-resource machine translation, helping to bridge the performance gap between high-resource and low-resource languages. The results contribute to the broader goal of developing more inclusive and accurate NLP models for marginalized communities and underrepresented populations. The code and datasets used in this study are publicly available to facilitate further research and ensure reproducibility, with the exception of certain data that may no longer be accessible due to changes in public availability."
}

<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="nguefack-etal-2025-pretraining">
    <titleInfo>
        <title>Pretraining Strategies using Monolingual and Parallel Data for Low-Resource Machine Translation</title>
    </titleInfo>
    <name type="personal">
        <namePart type="given">Idriss</namePart>
        <namePart type="given">Nguepi</namePart>
        <namePart type="family">Nguefack</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Mara</namePart>
        <namePart type="family">Finkelstein</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Toadoum</namePart>
        <namePart type="given">Sari</namePart>
        <namePart type="family">Sakayo</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <originInfo>
        <dateIssued>2025-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
        <titleInfo>
            <title>Proceedings of the Sixth Workshop on African Natural Language Processing (AfricaNLP 2025)</title>
        </titleInfo>
        <name type="personal">
            <namePart type="given">Constantine</namePart>
            <namePart type="family">Lignos</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <name type="personal">
            <namePart type="given">Idris</namePart>
            <namePart type="family">Abdulmumin</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <name type="personal">
            <namePart type="given">David</namePart>
            <namePart type="family">Adelani</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <originInfo>
            <publisher>Association for Computational Linguistics</publisher>
            <place>
                <placeTerm type="text">Vienna, Austria</placeTerm>
            </place>
        </originInfo>
        <genre authority="marcgt">conference publication</genre>
        <identifier type="isbn">979-8-89176-257-2</identifier>
    </relatedItem>
    <abstract>This research article examines the effectiveness of various pretraining strategies for developing machine translation models tailored to low-resource languages. Although this work considers several low-resource languages, including Afrikaans, Swahili, and Zulu, the translation model is specifically developed for Lingala, an under-resourced African language, building upon the pretraining approach introduced by Reid and Artetxe (2021), originally designed for high-resource languages. Through a series of comprehensive experiments, we explore different pretraining methodologies, including the integration of multiple languages and the use of both monolingual and parallel data during the pretraining phase. Our findings indicate that pretraining on multiple languages and leveraging both monolingual and parallel data significantly enhance translation quality. This study offers valuable insights into effective pretraining strategies for low-resource machine translation, helping to bridge the performance gap between high-resource and low-resource languages. The results contribute to the broader goal of developing more inclusive and accurate NLP models for marginalized communities and underrepresented populations. The code and datasets used in this study are publicly available to facilitate further research and ensure reproducibility, with the exception of certain data that may no longer be accessible due to changes in public availability.</abstract>
<identifier type="citekey">nguefack-etal-2025-pretraining</identifier>
<identifier type="doi">10.18653/v1/2025.africanlp-1.6</identifier>
<location>
<url>https://aclanthology.org/2025.africanlp-1.6/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>31</start>
<end>38</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Pretraining Strategies using Monolingual and Parallel Data for Low-Resource Machine Translation
%A Nguefack, Idriss Nguepi
%A Finkelstein, Mara
%A Sakayo, Toadoum Sari
%Y Lignos, Constantine
%Y Abdulmumin, Idris
%Y Adelani, David
%S Proceedings of the Sixth Workshop on African Natural Language Processing (AfricaNLP 2025)
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-257-2
%F nguefack-etal-2025-pretraining
%X This research article examines the effectiveness of various pretraining strategies for developing machine translation models tailored to low-resource languages. Although this work considers several low-resource languages, including Afrikaans, Swahili, and Zulu, the translation model is specifically developed for Lingala, an under-resourced African language, building upon the pretraining approach introduced by Reid and Artetxe (2021), originally designed for high-resource languages. Through a series of comprehensive experiments, we explore different pretraining methodologies, including the integration of multiple languages and the use of both monolingual and parallel data during the pretraining phase. Our findings indicate that pretraining on multiple languages and leveraging both monolingual and parallel data significantly enhance translation quality. This study offers valuable insights into effective pretraining strategies for low-resource machine translation, helping to bridge the performance gap between high-resource and low-resource languages. The results contribute to the broader goal of developing more inclusive and accurate NLP models for marginalized communities and underrepresented populations. The code and datasets used in this study are publicly available to facilitate further research and ensure reproducibility, with the exception of certain data that may no longer be accessible due to changes in public availability.
%R 10.18653/v1/2025.africanlp-1.6
%U https://aclanthology.org/2025.africanlp-1.6/
%U https://doi.org/10.18653/v1/2025.africanlp-1.6
%P 31-38

Markdown (Informal)
[Pretraining Strategies using Monolingual and Parallel Data for Low-Resource Machine Translation](https://aclanthology.org/2025.africanlp-1.6/) (Nguefack et al., AfricaNLP 2025)
ACL
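Idriss Nguepi Nguefack, Mara Finkelstein, and Toadoum Sari Sakayo. 2025. Pretraining Strategies using Monolingual and Parallel Data for Low-Resource Machine Translation. In Proceedings of the Sixth Workshop on African Natural Language Processing (AfricaNLP 2025), pages 31–38, Vienna, Austria. Association for Computational Linguistics.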