% Workshop paper (AmericasNLP 2026, ACL Anthology export).
% Proper nouns and acronyms in the title are brace-protected as whole words so
% that sentence-casing bibliography styles keep their capitalisation.
@inproceedings{kriukova-etal-2026-data,
  title     = {A data-centric approach to performance improvement in under-resourced {ASR}: The case of {D{\"e}n{\"e}} {S{\k{u}}{\l}{\i}n{\'e}}},
  author    = {Kriukova, Olga and
               Lovick, Olga and
               Arppe, Antti},
  editor    = {Mager, Manuel and
               Ebrahimi, Abteen and
               Bui, Minh Duc and
               Pugh, Robert and
               Oncevay, Arturo and
               Chiruzzo, Luis and
               Solano, Rolando Coto and
               Rijhwani, Shruti and
               Von Der Wense, Katharina},
  booktitle = {Proceedings of the Sixth Workshop on {NLP} for Indigenous Languages of the {Americas} ({AmericasNLP})},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, USA},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.americasnlp-6.9/},
  pages     = {95--106},
  isbn      = {979-8-89176-415-6},
  abstract  = {This paper presents a study focused on advancing Automatic Speech Recognition (ASR) for the under-resourced language D{\"e}n{\"e} S{\k{u}}{\l}{\i}n{\'e} through data-centric approaches. We explore multiple strategies to enhance the quality of training data{---}both audio recordings and transcriptions{---}to address the challenges posed by mixed-quality datasets. Our experiments investigate which data preparation techniques most effectively improve ASR performance in this context. Our findings show that reducing non-phonemic spelling variation in the corpus significantly improves model generalization, resulting in a substantial increase in recognition accuracy. Additionally, we demonstrate that increasing manually reviewed transcriptions consistently improves word and character error rates, while audio enhancement slightly reduces performance, highlighting the complex trade-offs in low-resource ASR development.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<!-- MODS v3 record for a single workshop paper; the ID below is repeated as the citekey identifier further down. -->
<mods ID="kriukova-etal-2026-data">
<titleInfo>
<title>A data-centric approach to performance improvement in under-resourced ASR: The case of Dënë Sųłıné</title>
</titleInfo>
<!-- Paper authors, in byline order. -->
<name type="personal">
<namePart type="given">Olga</namePart>
<namePart type="family">Kriukova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Olga</namePart>
<namePart type="family">Lovick</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Antti</namePart>
<namePart type="family">Arppe</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<!-- Issue date of the paper itself (year and month only). -->
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<!-- Host item: the proceedings volume containing this paper, with its editors, publisher, place and ISBN. -->
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Sixth Workshop on NLP for Indigenous Languages of the Americas (AmericasNLP)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Manuel</namePart>
<namePart type="family">Mager</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Abteen</namePart>
<namePart type="family">Ebrahimi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Minh</namePart>
<namePart type="given">Duc</namePart>
<namePart type="family">Bui</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Robert</namePart>
<namePart type="family">Pugh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Arturo</namePart>
<namePart type="family">Oncevay</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Luis</namePart>
<namePart type="family">Chiruzzo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<!-- NOTE(review): "Coto" is recorded as a second given name and "Solano" as the family name; this may be a compound surname that was split. Verify against the proceedings front matter. -->
<name type="personal">
<namePart type="given">Rolando</namePart>
<namePart type="given">Coto</namePart>
<namePart type="family">Solano</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shruti</namePart>
<namePart type="family">Rijhwani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<!-- NOTE(review): the family name is stored with capitalised particles ("Von Der"); confirm this matches the editor's preferred spelling. -->
<name type="personal">
<namePart type="given">Katharina</namePart>
<namePart type="family">Von Der Wense</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-415-6</identifier>
</relatedItem>
<abstract>This paper presents a study focused on advancing Automatic Speech Recognition (ASR) for the under-resourced language Dënë Sųłıné through data-centric approaches. We explore multiple strategies to enhance the quality of training data—both audio recordings and transcriptions—to address the challenges posed by mixed-quality datasets. Our experiments investigate which data preparation techniques most effectively improve ASR performance in this context. Our findings show that reducing non-phonemic spelling variation in the corpus significantly improves model generalization, resulting in a substantial increase in recognition accuracy. Additionally, we demonstrate that increasing manually reviewed transcriptions consistently improves word and character error rates, while audio enhancement slightly reduces performance, highlighting the complex trade-offs in low-resource ASR development.</abstract>
<!-- Citation key; same value as the mods ID attribute above. -->
<identifier type="citekey">kriukova-etal-2026-data</identifier>
<location>
<url>https://aclanthology.org/2026.americasnlp-6.9/</url>
</location>
<!-- Position of the paper within the host volume: date plus first and last page. -->
<part>
<date>2026-07</date>
<extent unit="page">
<start>95</start>
<end>106</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T A data-centric approach to performance improvement in under-resourced ASR: The case of Dënë Sųłıné
%A Kriukova, Olga
%A Lovick, Olga
%A Arppe, Antti
%Y Mager, Manuel
%Y Ebrahimi, Abteen
%Y Bui, Minh Duc
%Y Pugh, Robert
%Y Oncevay, Arturo
%Y Chiruzzo, Luis
%Y Solano, Rolando Coto
%Y Rijhwani, Shruti
%Y Von Der Wense, Katharina
%S Proceedings of the Sixth Workshop on NLP for Indigenous Languages of the Americas (AmericasNLP)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, USA
%@ 979-8-89176-415-6
%F kriukova-etal-2026-data
%X This paper presents a study focused on advancing Automatic Speech Recognition (ASR) for the under-resourced language Dënë Sųłıné through data-centric approaches. We explore multiple strategies to enhance the quality of training data—both audio recordings and transcriptions—to address the challenges posed by mixed-quality datasets. Our experiments investigate which data preparation techniques most effectively improve ASR performance in this context. Our findings show that reducing non-phonemic spelling variation in the corpus significantly improves model generalization, resulting in a substantial increase in recognition accuracy. Additionally, we demonstrate that increasing manually reviewed transcriptions consistently improves word and character error rates, while audio enhancement slightly reduces performance, highlighting the complex trade-offs in low-resource ASR development.
%U https://aclanthology.org/2026.americasnlp-6.9/
%P 95-106
Markdown (Informal)
[A data-centric approach to performance improvement in under-resourced ASR: The case of Dënë Sųłıné](https://aclanthology.org/2026.americasnlp-6.9/) (Kriukova et al., AmericasNLP 2026)
ACL