% ACL Anthology record 2026.dravidianlangtech-1.40 (BibTeX form; MODS XML and EndNote forms of the same paper follow).
@inproceedings{sekar-etal-2026-iiitk,
  title     = {{IIITK\_SpeechScape@DravidianLangTech} 2026: Dialect based speech recognition and classification using Speech Foundation Models and Deep Learning Techniques},
  author    = {Sekar, G Srishtik and
               Dhamodaran, Harissh Ragav and
               S, Kishore Shankar and
               Palani, Balasubramanian and
               Tharaniya Sairaj, R},
  editor    = {Chakravarthi, Bharathi Raja and
               Priyadharshini, Ruba and
               Madasamy, Anand Kumar and
               Thavareesan, Sajeetha and
               Rajiakodi, Saranya and
               Navaneethakrishnan, Subalalitha and
               Chinnappa, Dhivya and
               Palani, Balasubramanian and
               Subramanian, Malliga and
               Shanmugavadivel, Kogilavani and
               Rajalakshmi, Ratnavel},
  booktitle = {Proceedings of the Sixth Workshop on Speech, Vision, and Language Technologies for {Dravidian} Languages},
  month     = jul,
  year      = {2026},
  address   = {Underline (Virtual)},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.dravidianlangtech-1.40/},
  pages     = {268--272},
  isbn      = {979-8-89176-401-9},
  abstract  = {Dialectal variation poses a significant challenge to Automatic Speech Recognition (ASR), particularly for low resource morphologically rich languages such as Tamil. Although widely spoken in India, Sri Lanka, and the global diaspora, Tamil exhibits substantial phonetic, lexical, and prosodic variation across dialects, complicating both dialect classification and speech recognition. In this work, we address these tasks within a unified framework. We evaluate state-of-the-art models for dialect classification, including Whisper, CLDNN, wav2vec, and wavLM, and for ASR, Whisper and a zero-shot Conformer. Among them, Whisper achieves the best performance, obtaining a macro F1-score of 0.46 for dialect classification and a word error rate of 0.57 for ASR. These results highlight the strong generalization capability of transformer-based foundation models across dialects and languages. The code is publicly available on GitHub for research purposes.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="sekar-etal-2026-iiitk">
<titleInfo>
<title>IIITK_SpeechScape@DravidianLangTech 2026: Dialect based speech recognition and classification using Speech Foundation Models and Deep Learning Techniques</title>
</titleInfo>
<name type="personal">
<namePart type="given">G</namePart>
<namePart type="given">Srishtik</namePart>
<namePart type="family">Sekar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Harissh</namePart>
<namePart type="given">Ragav</namePart>
<namePart type="family">Dhamodaran</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kishore</namePart>
<namePart type="given">Shankar</namePart>
<namePart type="family">S</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Balasubramanian</namePart>
<namePart type="family">Palani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">R</namePart>
<namePart type="family">Tharaniya Sairaj</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Sixth Workshop on Speech, Vision, and Language Technologies for Dravidian Languages</title>
</titleInfo>
<name type="personal">
<namePart type="given">Bharathi</namePart>
<namePart type="given">Raja</namePart>
<namePart type="family">Chakravarthi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ruba</namePart>
<namePart type="family">Priyadharshini</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anand</namePart>
<namePart type="given">Kumar</namePart>
<namePart type="family">Madasamy</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sajeetha</namePart>
<namePart type="family">Thavareesan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Saranya</namePart>
<namePart type="family">Rajiakodi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Subalalitha</namePart>
<namePart type="family">Navaneethakrishnan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dhivya</namePart>
<namePart type="family">Chinnappa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Balasubramanian</namePart>
<namePart type="family">Palani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Malliga</namePart>
<namePart type="family">Subramanian</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kogilavani</namePart>
<namePart type="family">Shanmugavadivel</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ratnavel</namePart>
<namePart type="family">Rajalakshmi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Underline (Virtual)</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-401-9</identifier>
</relatedItem>
<abstract>Dialectal variation poses a significant challenge to Automatic Speech Recognition (ASR), particularly for low resource morphologically rich languages such as Tamil. Although widely spoken in India, Sri Lanka, and the global diaspora, Tamil exhibits substantial phonetic, lexical, and prosodic variation across dialects, complicating both dialect classification and speech recognition. In this work, we address these tasks within a unified framework. We evaluate state-of-the-art models for dialect classification, including Whisper, CLDNN, wav2vec, and wavLM, and for ASR, Whisper and a zero-shot Conformer. Among them, Whisper achieves the best performance, obtaining a macro F1-score of 0.46 for dialect classification and a word error rate of 0.57 for ASR. These results highlight the strong generalization capability of transformer-based foundation models across dialects and languages. The code is publicly available on GitHub for research purposes.</abstract>
<identifier type="citekey">sekar-etal-2026-iiitk</identifier>
<location>
<url>https://aclanthology.org/2026.dravidianlangtech-1.40/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>268</start>
<end>272</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T IIITK_SpeechScape@DravidianLangTech 2026: Dialect based speech recognition and classification using Speech Foundation Models and Deep Learning Techniques
%A Sekar, G. Srishtik
%A Dhamodaran, Harissh Ragav
%A S, Kishore Shankar
%A Palani, Balasubramanian
%A Tharaniya Sairaj, R.
%Y Chakravarthi, Bharathi Raja
%Y Priyadharshini, Ruba
%Y Madasamy, Anand Kumar
%Y Thavareesan, Sajeetha
%Y Rajiakodi, Saranya
%Y Navaneethakrishnan, Subalalitha
%Y Chinnappa, Dhivya
%Y Palani, Balasubramanian
%Y Subramanian, Malliga
%Y Shanmugavadivel, Kogilavani
%Y Rajalakshmi, Ratnavel
%S Proceedings of the Sixth Workshop on Speech, Vision, and Language Technologies for Dravidian Languages
%D 2026
%8 July
%I Association for Computational Linguistics
%C Underline (Virtual)
%@ 979-8-89176-401-9
%F sekar-etal-2026-iiitk
%X Dialectal variation poses a significant challenge to Automatic Speech Recognition (ASR), particularly for low resource morphologically rich languages such as Tamil. Although widely spoken in India, Sri Lanka, and the global diaspora, Tamil exhibits substantial phonetic, lexical, and prosodic variation across dialects, complicating both dialect classification and speech recognition. In this work, we address these tasks within a unified framework. We evaluate state-of-the-art models for dialect classification, including Whisper, CLDNN, wav2vec, and wavLM, and for ASR, Whisper and a zero-shot Conformer. Among them, Whisper achieves the best performance, obtaining a macro F1-score of 0.46 for dialect classification and a word error rate of 0.57 for ASR. These results highlight the strong generalization capability of transformer-based foundation models across dialects and languages. The code is publicly available on GitHub for research purposes.
%U https://aclanthology.org/2026.dravidianlangtech-1.40/
%P 268-272
Markdown (Informal)
[IIITK_SpeechScape@DravidianLangTech 2026: Dialect based speech recognition and classification using Speech Foundation Models and Deep Learning Techniques](https://aclanthology.org/2026.dravidianlangtech-1.40/) (Sekar et al., DravidianLangTech 2026)
ACL