@inproceedings{olaleye-etal-2025-afrocs,
title = "{A}fro{CS}-xs: Creating a Compact, High-Quality, Human-Validated Code-Switched Dataset for {A}frican Languages",
author = "Olaleye, Kayode and
Oncevay, Arturo and
Sibue, Mathieu and
Zondi, Nombuyiselo and
Terblanche, Michelle and
Mapikitla, Sibongile and
Lastrucci, Richard and
Smiley, Charese and
Marivate, Vukosi",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.acl-long.1601/",
doi = "10.18653/v1/2025.acl-long.1601",
pages = "33391--33410",
ISBN = "979-8-89176-251-0",
abstract = "Code-switching is prevalent in multilingual communities but lacks adequate high-quality data for model development, especially for African languages. To address this, we present AfroCS-xs, a small human-validated synthetic code-switched dataset for four African languages (Afrikaans, Sesotho, Yoruba, isiZulu) and English within a specific domain{---}agriculture. Using large language models (LLMs), we generate code-switched sentences, including English translations, that are rigorously validated and corrected by native speakers. As a downstream evaluation task, we use this dataset to fine-tune different instruction-tuned LLMs for code-switched translation and compare their performance against machine translation (MT) models. Our results demonstrate that LLMs consistently improve in translation accuracy when fine-tuned on the high-quality AfroCS-xs dataset, highlighting that substantial gains can still be made with a low volume of data. We also observe improvements on natural code-switched and out-of-domain (personal finance) test sets. Overall, regardless of data size and prior exposure to a language, LLMs benefit from higher quality training data when translating code-switched texts in under-represented languages."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="olaleye-etal-2025-afrocs">
<titleInfo>
<title>AfroCS-xs: Creating a Compact, High-Quality, Human-Validated Code-Switched Dataset for African Languages</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kayode</namePart>
<namePart type="family">Olaleye</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Arturo</namePart>
<namePart type="family">Oncevay</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mathieu</namePart>
<namePart type="family">Sibue</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nombuyiselo</namePart>
<namePart type="family">Zondi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Michelle</namePart>
<namePart type="family">Terblanche</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sibongile</namePart>
<namePart type="family">Mapikitla</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Richard</namePart>
<namePart type="family">Lastrucci</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Charese</namePart>
<namePart type="family">Smiley</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vukosi</namePart>
<namePart type="family">Marivate</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wanxiang</namePart>
<namePart type="family">Che</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joyce</namePart>
<namePart type="family">Nabende</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Shutova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohammad</namePart>
<namePart type="given">Taher</namePart>
<namePart type="family">Pilehvar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-251-0</identifier>
</relatedItem>
<abstract>Code-switching is prevalent in multilingual communities but lacks adequate high-quality data for model development, especially for African languages. To address this, we present AfroCS-xs, a small human-validated synthetic code-switched dataset for four African languages (Afrikaans, Sesotho, Yoruba, isiZulu) and English within a specific domain—agriculture. Using large language models (LLMs), we generate code-switched sentences, including English translations, that are rigorously validated and corrected by native speakers. As a downstream evaluation task, we use this dataset to fine-tune different instruction-tuned LLMs for code-switched translation and compare their performance against machine translation (MT) models. Our results demonstrate that LLMs consistently improve in translation accuracy when fine-tuned on the high-quality AfroCS-xs dataset, highlighting that substantial gains can still be made with a low volume of data. We also observe improvements on natural code-switched and out-of-domain (personal finance) test sets. Overall, regardless of data size and prior exposure to a language, LLMs benefit from higher quality training data when translating code-switched texts in under-represented languages.</abstract>
<identifier type="citekey">olaleye-etal-2025-afrocs</identifier>
<identifier type="doi">10.18653/v1/2025.acl-long.1601</identifier>
<location>
<url>https://aclanthology.org/2025.acl-long.1601/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>33391</start>
<end>33410</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T AfroCS-xs: Creating a Compact, High-Quality, Human-Validated Code-Switched Dataset for African Languages
%A Olaleye, Kayode
%A Oncevay, Arturo
%A Sibue, Mathieu
%A Zondi, Nombuyiselo
%A Terblanche, Michelle
%A Mapikitla, Sibongile
%A Lastrucci, Richard
%A Smiley, Charese
%A Marivate, Vukosi
%Y Che, Wanxiang
%Y Nabende, Joyce
%Y Shutova, Ekaterina
%Y Pilehvar, Mohammad Taher
%S Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-251-0
%F olaleye-etal-2025-afrocs
%X Code-switching is prevalent in multilingual communities but lacks adequate high-quality data for model development, especially for African languages. To address this, we present AfroCS-xs, a small human-validated synthetic code-switched dataset for four African languages (Afrikaans, Sesotho, Yoruba, isiZulu) and English within a specific domain—agriculture. Using large language models (LLMs), we generate code-switched sentences, including English translations, that are rigorously validated and corrected by native speakers. As a downstream evaluation task, we use this dataset to fine-tune different instruction-tuned LLMs for code-switched translation and compare their performance against machine translation (MT) models. Our results demonstrate that LLMs consistently improve in translation accuracy when fine-tuned on the high-quality AfroCS-xs dataset, highlighting that substantial gains can still be made with a low volume of data. We also observe improvements on natural code-switched and out-of-domain (personal finance) test sets. Overall, regardless of data size and prior exposure to a language, LLMs benefit from higher quality training data when translating code-switched texts in under-represented languages.
%R 10.18653/v1/2025.acl-long.1601
%U https://aclanthology.org/2025.acl-long.1601/
%U https://doi.org/10.18653/v1/2025.acl-long.1601
%P 33391-33410
Markdown (Informal)
[AfroCS-xs: Creating a Compact, High-Quality, Human-Validated Code-Switched Dataset for African Languages](https://aclanthology.org/2025.acl-long.1601/) (Olaleye et al., ACL 2025)
ACL
- Kayode Olaleye, Arturo Oncevay, Mathieu Sibue, Nombuyiselo Zondi, Michelle Terblanche, Sibongile Mapikitla, Richard Lastrucci, Charese Smiley, and Vukosi Marivate. 2025. AfroCS-xs: Creating a Compact, High-Quality, Human-Validated Code-Switched Dataset for African Languages. In Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 33391–33410, Vienna, Austria. Association for Computational Linguistics.