@inproceedings{dipto-etal-2025-asr,
title = "Are {ASR} foundation models generalized enough to capture features of regional dialects for low-resource languages?",
author = "Dipto, Tawsif Tashwar and
Azmol Hossain and
Faruque, Rubayet Sabbir and
Hassan, Md. Rezuwan and
Fatema, Kanij and
Shome, Tanmoy and
Naswan, Ruwad and
Zihad, Md.Foriduzzaman and
Anam, Mohaymen Ul and
Tasnim, Nazia and
Mahmud, Hasan and
Hasan, Md Kamrul and
Shawon, Md. Mehedi Hasan and
Sadeque, Farig and
Reasat, Tahsin",
editor = "Inui, Kentaro and
Sakti, Sakriani and
Wang, Haofen and
Wong, Derek F. and
Bhattacharyya, Pushpak and
Banerjee, Biplab and
Ekbal, Asif and
Chakraborty, Tanmoy and
Singh, Dhirendra Pratap",
booktitle = "Proceedings of the 14th International Joint Conference on Natural Language Processing and the 4th Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics",
month = dec,
year = "2025",
address = "Mumbai, India",
publisher = "The Asian Federation of Natural Language Processing and The Association for Computational Linguistics",
url = "https://aclanthology.org/2025.ijcnlp-short.17/",
pages = "178--188",
ISBN = "979-8-89176-299-2",
abstract = "Conventional research on speech recognition modeling relies on the canonical form for most low-resource languages while automatic speech recognition (ASR) for regional dialects is treated as a fine-tuning task. To investigate the effects of dialectal variations on ASR we develop a 78-hour annotated Bengali Speech-to-Text (STT) corpus named Ben-10. Investigation from linguistic and data-driven perspectives shows that speech foundation models struggle heavily in regional dialect ASR, both in zero-shot and fine-tuned settings. We observe that all deep learning methods struggle to model speech data under dialectal variations, but dialect specific model training alleviates the issue. Our dataset also serves as a out-of-distribution (OOD) resource for ASR modeling under constrained resources in ASR algorithms. The dataset and code developed for this project are publicly available."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="dipto-etal-2025-asr">
<titleInfo>
<title>Are ASR foundation models generalized enough to capture features of regional dialects for low-resource languages?</title>
</titleInfo>
<name type="personal">
<namePart type="given">Tawsif</namePart>
<namePart type="given">Tashwar</namePart>
<namePart type="family">Dipto</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Azmol</namePart>
<namePart type="family">Hossain</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rubayet</namePart>
<namePart type="given">Sabbir</namePart>
<namePart type="family">Faruque</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Md.</namePart>
<namePart type="given">Rezuwan</namePart>
<namePart type="family">Hassan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kanij</namePart>
<namePart type="family">Fatema</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tanmoy</namePart>
<namePart type="family">Shome</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ruwad</namePart>
<namePart type="family">Naswan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Md.Foriduzzaman</namePart>
<namePart type="family">Zihad</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohaymen</namePart>
<namePart type="given">Ul</namePart>
<namePart type="family">Anam</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nazia</namePart>
<namePart type="family">Tasnim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hasan</namePart>
<namePart type="family">Mahmud</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Md</namePart>
<namePart type="given">Kamrul</namePart>
<namePart type="family">Hasan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Md.</namePart>
<namePart type="given">Mehedi</namePart>
<namePart type="given">Hasan</namePart>
<namePart type="family">Shawon</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Farig</namePart>
<namePart type="family">Sadeque</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tahsin</namePart>
<namePart type="family">Reasat</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 14th International Joint Conference on Natural Language Processing and the 4th Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kentaro</namePart>
<namePart type="family">Inui</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sakriani</namePart>
<namePart type="family">Sakti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Haofen</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Derek</namePart>
<namePart type="given">F</namePart>
<namePart type="family">Wong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pushpak</namePart>
<namePart type="family">Bhattacharyya</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Biplab</namePart>
<namePart type="family">Banerjee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Asif</namePart>
<namePart type="family">Ekbal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tanmoy</namePart>
<namePart type="family">Chakraborty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dhirendra</namePart>
<namePart type="given">Pratap</namePart>
<namePart type="family">Singh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>The Asian Federation of Natural Language Processing and The Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Mumbai, India</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-299-2</identifier>
</relatedItem>
<abstract>Conventional research on speech recognition modeling relies on the canonical form for most low-resource languages while automatic speech recognition (ASR) for regional dialects is treated as a fine-tuning task. To investigate the effects of dialectal variations on ASR we develop a 78-hour annotated Bengali Speech-to-Text (STT) corpus named Ben-10. Investigation from linguistic and data-driven perspectives shows that speech foundation models struggle heavily in regional dialect ASR, both in zero-shot and fine-tuned settings. We observe that all deep learning methods struggle to model speech data under dialectal variations, but dialect specific model training alleviates the issue. Our dataset also serves as a out-of-distribution (OOD) resource for ASR modeling under constrained resources in ASR algorithms. The dataset and code developed for this project are publicly available.</abstract>
<identifier type="citekey">dipto-etal-2025-asr</identifier>
<location>
<url>https://aclanthology.org/2025.ijcnlp-short.17/</url>
</location>
<part>
<date>2025-12</date>
<extent unit="page">
<start>178</start>
<end>188</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Are ASR foundation models generalized enough to capture features of regional dialects for low-resource languages?
%A Dipto, Tawsif Tashwar
%A Hossain, Azmol
%A Faruque, Rubayet Sabbir
%A Hassan, Md. Rezuwan
%A Fatema, Kanij
%A Shome, Tanmoy
%A Naswan, Ruwad
%A Zihad, Md.Foriduzzaman
%A Anam, Mohaymen Ul
%A Tasnim, Nazia
%A Mahmud, Hasan
%A Hasan, Md Kamrul
%A Shawon, Md. Mehedi Hasan
%A Sadeque, Farig
%A Reasat, Tahsin
%Y Inui, Kentaro
%Y Sakti, Sakriani
%Y Wang, Haofen
%Y Wong, Derek F.
%Y Bhattacharyya, Pushpak
%Y Banerjee, Biplab
%Y Ekbal, Asif
%Y Chakraborty, Tanmoy
%Y Singh, Dhirendra Pratap
%S Proceedings of the 14th International Joint Conference on Natural Language Processing and the 4th Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics
%D 2025
%8 December
%I The Asian Federation of Natural Language Processing and The Association for Computational Linguistics
%C Mumbai, India
%@ 979-8-89176-299-2
%F dipto-etal-2025-asr
%X Conventional research on speech recognition modeling relies on the canonical form for most low-resource languages while automatic speech recognition (ASR) for regional dialects is treated as a fine-tuning task. To investigate the effects of dialectal variations on ASR we develop a 78-hour annotated Bengali Speech-to-Text (STT) corpus named Ben-10. Investigation from linguistic and data-driven perspectives shows that speech foundation models struggle heavily in regional dialect ASR, both in zero-shot and fine-tuned settings. We observe that all deep learning methods struggle to model speech data under dialectal variations, but dialect specific model training alleviates the issue. Our dataset also serves as a out-of-distribution (OOD) resource for ASR modeling under constrained resources in ASR algorithms. The dataset and code developed for this project are publicly available.
%U https://aclanthology.org/2025.ijcnlp-short.17/
%P 178-188
Markdown (Informal)
[Are ASR foundation models generalized enough to capture features of regional dialects for low-resource languages?](https://aclanthology.org/2025.ijcnlp-short.17/) (Dipto et al., IJCNLP-AACL 2025)
ACL
- Tawsif Tashwar Dipto, Azmol Hossain, Rubayet Sabbir Faruque, Md. Rezuwan Hassan, Kanij Fatema, Tanmoy Shome, Ruwad Naswan, Md.Foriduzzaman Zihad, Mohaymen Ul Anam, Nazia Tasnim, Hasan Mahmud, Md Kamrul Hasan, Md. Mehedi Hasan Shawon, Farig Sadeque, and Tahsin Reasat. 2025. Are ASR foundation models generalized enough to capture features of regional dialects for low-resource languages?. In Proceedings of the 14th International Joint Conference on Natural Language Processing and the 4th Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics, pages 178–188, Mumbai, India. The Asian Federation of Natural Language Processing and The Association for Computational Linguistics.