@inproceedings{caswell-etal-2025-smol,
title = "{SMOL}: Professionally Translated Parallel Data for 115 Under-represented Languages",
author = "Caswell, Isaac and
Nielsen, Elizabeth and
Luo, Jiaming and
Cherry, Colin and
Kovacs, Geza and
Shemtov, Hadar and
Talukdar, Partha and
Tewari, Dinesh and
Doumbouya, Moussa and
Diane, Djibrila and
Diane, Baba Mamadi and
Farabado, Solo and
Ferrante, Edoardo and
Guasoni, Alessandro and
Keita, Mamadou and
Debbarma, Sudhamoy and
Kuzhuget, Ali and
Anugraha, David and
Shulthan Habibi, Muhammad Ravi and
Ahmadi, Sina and
Liu, Mingfei and
Eng, Jonathan",
editor = "Haddow, Barry and
Kocmi, Tom and
Koehn, Philipp and
Monz, Christof",
booktitle = "Proceedings of the Tenth Conference on Machine Translation",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.wmt-1.85/",
pages = "1103--1123",
ISBN = "979-8-89176-341-8",
abstract = "We open-source SMOL(Set of Maximal Over-all Leverage), a suite of training data to un-lock machine translation for low-resource languages (LRLs). SMOL has been translated into123 under-resourced languages (125 language pairs), including many for which there exist no previous public resources, for a total of 6.1M translated tokens. SMOL comprises two sub-datasets, each carefully chosen for maximum impact given its size: SMOLSENT, a set of sentences chosen for broad unique token coverage, and SMOLDOC, a document-level source focusing on a broad topic coverage. They join the already released GATITOS for a trifecta of paragraph, sentence, and token-level content. We demonstrate that using SMOL to prompt or fine-tune Large Language Models yields robust chrF improvements. In addition to translation, we provide factuality ratings and rationales for all documents in SMOLDOC, yielding the first factuality datasets for most of these languages."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="caswell-etal-2025-smol">
<titleInfo>
<title>SMOL: Professionally Translated Parallel Data for 115 Under-represented Languages</title>
</titleInfo>
<name type="personal">
<namePart type="given">Isaac</namePart>
<namePart type="family">Caswell</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Elizabeth</namePart>
<namePart type="family">Nielsen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiaming</namePart>
<namePart type="family">Luo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Colin</namePart>
<namePart type="family">Cherry</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Geza</namePart>
<namePart type="family">Kovacs</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hadar</namePart>
<namePart type="family">Shemtov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Partha</namePart>
<namePart type="family">Talukdar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dinesh</namePart>
<namePart type="family">Tewari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Moussa</namePart>
<namePart type="family">Doumbouya</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Djibrila</namePart>
<namePart type="family">Diane</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Baba</namePart>
<namePart type="given">Mamadi</namePart>
<namePart type="family">Diane</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Solo</namePart>
<namePart type="family">Farabado</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Edoardo</namePart>
<namePart type="family">Ferrante</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alessandro</namePart>
<namePart type="family">Guasoni</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mamadou</namePart>
<namePart type="family">Keita</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sudhamoy</namePart>
<namePart type="family">Debbarma</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ali</namePart>
<namePart type="family">Kuzhuget</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Anugraha</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Muhammad</namePart>
<namePart type="given">Ravi</namePart>
<namePart type="family">Shulthan Habibi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sina</namePart>
<namePart type="family">Ahmadi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mingfei</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jonathan</namePart>
<namePart type="family">Eng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Tenth Conference on Machine Translation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Barry</namePart>
<namePart type="family">Haddow</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tom</namePart>
<namePart type="family">Kocmi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Philipp</namePart>
<namePart type="family">Koehn</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Christof</namePart>
<namePart type="family">Monz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Suzhou, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-341-8</identifier>
</relatedItem>
<abstract>We open-source SMOL(Set of Maximal Over-all Leverage), a suite of training data to un-lock machine translation for low-resource languages (LRLs). SMOL has been translated into123 under-resourced languages (125 language pairs), including many for which there exist no previous public resources, for a total of 6.1M translated tokens. SMOL comprises two sub-datasets, each carefully chosen for maximum impact given its size: SMOLSENT, a set of sentences chosen for broad unique token coverage, and SMOLDOC, a document-level source focusing on a broad topic coverage. They join the already released GATITOS for a trifecta of paragraph, sentence, and token-level content. We demonstrate that using SMOL to prompt or fine-tune Large Language Models yields robust chrF improvements. In addition to translation, we provide factuality ratings and rationales for all documents in SMOLDOC, yielding the first factuality datasets for most of these languages.</abstract>
<identifier type="citekey">caswell-etal-2025-smol</identifier>
<location>
<url>https://aclanthology.org/2025.wmt-1.85/</url>
</location>
<part>
<date>2025-11</date>
<extent unit="page">
<start>1103</start>
<end>1123</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T SMOL: Professionally Translated Parallel Data for 115 Under-represented Languages
%A Caswell, Isaac
%A Nielsen, Elizabeth
%A Luo, Jiaming
%A Cherry, Colin
%A Kovacs, Geza
%A Shemtov, Hadar
%A Talukdar, Partha
%A Tewari, Dinesh
%A Doumbouya, Moussa
%A Diane, Djibrila
%A Diane, Baba Mamadi
%A Farabado, Solo
%A Ferrante, Edoardo
%A Guasoni, Alessandro
%A Keita, Mamadou
%A Debbarma, Sudhamoy
%A Kuzhuget, Ali
%A Anugraha, David
%A Shulthan Habibi, Muhammad Ravi
%A Ahmadi, Sina
%A Liu, Mingfei
%A Eng, Jonathan
%Y Haddow, Barry
%Y Kocmi, Tom
%Y Koehn, Philipp
%Y Monz, Christof
%S Proceedings of the Tenth Conference on Machine Translation
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-341-8
%F caswell-etal-2025-smol
%X We open-source SMOL(Set of Maximal Over-all Leverage), a suite of training data to un-lock machine translation for low-resource languages (LRLs). SMOL has been translated into123 under-resourced languages (125 language pairs), including many for which there exist no previous public resources, for a total of 6.1M translated tokens. SMOL comprises two sub-datasets, each carefully chosen for maximum impact given its size: SMOLSENT, a set of sentences chosen for broad unique token coverage, and SMOLDOC, a document-level source focusing on a broad topic coverage. They join the already released GATITOS for a trifecta of paragraph, sentence, and token-level content. We demonstrate that using SMOL to prompt or fine-tune Large Language Models yields robust chrF improvements. In addition to translation, we provide factuality ratings and rationales for all documents in SMOLDOC, yielding the first factuality datasets for most of these languages.
%U https://aclanthology.org/2025.wmt-1.85/
%P 1103-1123
Markdown (Informal)
[SMOL: Professionally Translated Parallel Data for 115 Under-represented Languages](https://aclanthology.org/2025.wmt-1.85/) (Caswell et al., WMT 2025)
ACL
- Isaac Caswell, Elizabeth Nielsen, Jiaming Luo, Colin Cherry, Geza Kovacs, Hadar Shemtov, Partha Talukdar, Dinesh Tewari, Moussa Doumbouya, Djibrila Diane, Baba Mamadi Diane, Solo Farabado, Edoardo Ferrante, Alessandro Guasoni, Mamadou Keita, Sudhamoy Debbarma, Ali Kuzhuget, David Anugraha, Muhammad Ravi Shulthan Habibi, Sina Ahmadi, Mingfei Liu, and Jonathan Eng. 2025. SMOL: Professionally Translated Parallel Data for 115 Under-represented Languages. In Proceedings of the Tenth Conference on Machine Translation, pages 1103–1123, Suzhou, China. Association for Computational Linguistics.