@inproceedings{rajab-etal-2025-esethu,
title = "The Esethu Framework: Reimagining Sustainable Dataset Governance and Curation for Low-Resource Languages",
author = "Rajab, Jenalea and
Aremu, Anuoluwapo and
Chimoto, Everlyn Asiko and
Dunbar, Dale and
Morrissey, Graham and
Thior, Fadel and
Potgieter, Luandrie and
Ojo, Jessica and
Tonja, Atnafu Lambebo and
Nekoto, Wilhelmina NdapewaOnyothi and
Moiloa, Pelonomi and
Abbott, Jade and
Marivate, Vukosi and
Rosman, Benjamin",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.acl-long.1487/",
doi = "10.18653/v1/2025.acl-long.1487",
pages = "30763--30776",
ISBN = "979-8-89176-251-0",
abstract = "This paper presents the Esethu Framework, a sustainable data curation framework specifically designed to empower local communities and ensure equitable benefit-sharing from their linguistic resource. This framework is supported by the Esethu license, a novel community-centric data license. As a proof of concept, we introduce the Vuk{'}uzenzele isiXhosa Speech Dataset (ViXSD), an open-source corpus developed under the Esethu Framework and License. The dataset, containing read speech from native isiXhosa speakers enriched with demographic and linguistic metadata, demonstrates how community-driven licensing and curation principles can bridge resource gaps in automatic speech recognition (ASR) for African languages while safeguarding the interests of data creators. We describe the framework guiding dataset development, outline the Esethu license provisions, present the methodology for ViXSD, and present ASR experiments validating ViXSD{'}s usability in building and refining voice-driven applications for isiXhosa."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="rajab-etal-2025-esethu">
<titleInfo>
<title>The Esethu Framework: Reimagining Sustainable Dataset Governance and Curation for Low-Resource Languages</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jenalea</namePart>
<namePart type="family">Rajab</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anuoluwapo</namePart>
<namePart type="family">Aremu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Everlyn</namePart>
<namePart type="given">Asiko</namePart>
<namePart type="family">Chimoto</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dale</namePart>
<namePart type="family">Dunbar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Graham</namePart>
<namePart type="family">Morrissey</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Fadel</namePart>
<namePart type="family">Thior</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Luandrie</namePart>
<namePart type="family">Potgieter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jessica</namePart>
<namePart type="family">Ojo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Atnafu</namePart>
<namePart type="given">Lambebo</namePart>
<namePart type="family">Tonja</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wilhelmina</namePart>
<namePart type="given">NdapewaOnyothi</namePart>
<namePart type="family">Nekoto</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pelonomi</namePart>
<namePart type="family">Moiloa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jade</namePart>
<namePart type="family">Abbott</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vukosi</namePart>
<namePart type="family">Marivate</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Benjamin</namePart>
<namePart type="family">Rosman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wanxiang</namePart>
<namePart type="family">Che</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joyce</namePart>
<namePart type="family">Nabende</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Shutova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohammad</namePart>
<namePart type="given">Taher</namePart>
<namePart type="family">Pilehvar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-251-0</identifier>
</relatedItem>
<abstract>This paper presents the Esethu Framework, a sustainable data curation framework specifically designed to empower local communities and ensure equitable benefit-sharing from their linguistic resource. This framework is supported by the Esethu license, a novel community-centric data license. As a proof of concept, we introduce the Vuk’uzenzele isiXhosa Speech Dataset (ViXSD), an open-source corpus developed under the Esethu Framework and License. The dataset, containing read speech from native isiXhosa speakers enriched with demographic and linguistic metadata, demonstrates how community-driven licensing and curation principles can bridge resource gaps in automatic speech recognition (ASR) for African languages while safeguarding the interests of data creators. We describe the framework guiding dataset development, outline the Esethu license provisions, present the methodology for ViXSD, and present ASR experiments validating ViXSD’s usability in building and refining voice-driven applications for isiXhosa.</abstract>
<identifier type="citekey">rajab-etal-2025-esethu</identifier>
<identifier type="doi">10.18653/v1/2025.acl-long.1487</identifier>
<location>
<url>https://aclanthology.org/2025.acl-long.1487/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>30763</start>
<end>30776</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T The Esethu Framework: Reimagining Sustainable Dataset Governance and Curation for Low-Resource Languages
%A Rajab, Jenalea
%A Aremu, Anuoluwapo
%A Chimoto, Everlyn Asiko
%A Dunbar, Dale
%A Morrissey, Graham
%A Thior, Fadel
%A Potgieter, Luandrie
%A Ojo, Jessica
%A Tonja, Atnafu Lambebo
%A Nekoto, Wilhelmina NdapewaOnyothi
%A Moiloa, Pelonomi
%A Abbott, Jade
%A Marivate, Vukosi
%A Rosman, Benjamin
%Y Che, Wanxiang
%Y Nabende, Joyce
%Y Shutova, Ekaterina
%Y Pilehvar, Mohammad Taher
%S Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-251-0
%F rajab-etal-2025-esethu
%X This paper presents the Esethu Framework, a sustainable data curation framework specifically designed to empower local communities and ensure equitable benefit-sharing from their linguistic resource. This framework is supported by the Esethu license, a novel community-centric data license. As a proof of concept, we introduce the Vuk’uzenzele isiXhosa Speech Dataset (ViXSD), an open-source corpus developed under the Esethu Framework and License. The dataset, containing read speech from native isiXhosa speakers enriched with demographic and linguistic metadata, demonstrates how community-driven licensing and curation principles can bridge resource gaps in automatic speech recognition (ASR) for African languages while safeguarding the interests of data creators. We describe the framework guiding dataset development, outline the Esethu license provisions, present the methodology for ViXSD, and present ASR experiments validating ViXSD’s usability in building and refining voice-driven applications for isiXhosa.
%R 10.18653/v1/2025.acl-long.1487
%U https://aclanthology.org/2025.acl-long.1487/
%U https://doi.org/10.18653/v1/2025.acl-long.1487
%P 30763-30776
Markdown (Informal)
[The Esethu Framework: Reimagining Sustainable Dataset Governance and Curation for Low-Resource Languages](https://aclanthology.org/2025.acl-long.1487/) (Rajab et al., ACL 2025)
ACL
- Jenalea Rajab, Anuoluwapo Aremu, Everlyn Asiko Chimoto, Dale Dunbar, Graham Morrissey, Fadel Thior, Luandrie Potgieter, Jessica Ojo, Atnafu Lambebo Tonja, Wilhelmina NdapewaOnyothi Nekoto, Pelonomi Moiloa, Jade Abbott, Vukosi Marivate, and Benjamin Rosman. 2025. The Esethu Framework: Reimagining Sustainable Dataset Governance and Curation for Low-Resource Languages. In Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 30763–30776, Vienna, Austria. Association for Computational Linguistics.