@inproceedings{srirag-etal-2025-evaluating,
title = "Evaluating Dialect Robustness of Language Models via Conversation Understanding",
author = "Srirag, Dipankar and
Sahoo, Nihar Ranjan and
Joshi, Aditya",
booktitle = "Proceedings of the Second Workshop on Scaling Up Multilingual {\&} Multi-Cultural Evaluation",
month = jan,
year = "2025",
address = "Abu Dhabi",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.sumeval-2.3/",
pages = "24--38",
abstract = "With an evergrowing number of LLMs reporting superlative performance for English, their ability to perform equitably for different dialects of English (i.e., dialect robustness) needs to be ascertained. Specifically, we use English language (US English or Indian English) conversations between humans who play the word-guessing game of {\textquoteleft}taboo{\textquoteleft}. We formulate two evaluative tasks: target word prediction (TWP) (i.e., predict the masked target word in a conversation) and target word selection (TWS) (i.e., select the most likely masked target word in a conversation, from among a set of candidate words). Extending MD3, an existing dialectic dataset of taboo-playing conversations, we introduce M-MD3, a target-word-masked version of MD3 with the en-US and en-IN subsets. We create two subsets: en-MV (where en-US is transformed to include dialectal information) and en-TR (where dialectal information is removed from en-IN). We evaluate three multilingual LLMs{--}one open source (Llama3) and two closed-source (GPT-4/3.5). LLMs perform significantly better for US English than Indian English for both TWP and TWS tasks, for all settings, exhibiting marginalisation against the Indian dialect of English. While GPT-based models perform the best, the comparatively smaller models work more equitably after fine-tuning. Our evaluation methodology exhibits a novel and reproducible way to examine attributes of language models using pre-existing dialogue datasets with language varieties. Dialect being an artifact of one`s culture, this paper demonstrates the gap in the performance of multilingual LLMs for communities that do not use a mainstream dialect."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="srirag-etal-2025-evaluating">
<titleInfo>
<title>Evaluating Dialect Robustness of Language Models via Conversation Understanding</title>
</titleInfo>
<name type="personal">
<namePart type="given">Dipankar</namePart>
<namePart type="family">Srirag</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nihar</namePart>
<namePart type="given">Ranjan</namePart>
<namePart type="family">Sahoo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aditya</namePart>
<namePart type="family">Joshi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-01</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Second Workshop on Scaling Up Multilingual &amp; Multi-Cultural Evaluation</title>
</titleInfo>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>With an evergrowing number of LLMs reporting superlative performance for English, their ability to perform equitably for different dialects of English (i.e., dialect robustness) needs to be ascertained. Specifically, we use English language (US English or Indian English) conversations between humans who play the word-guessing game of ‘taboo’. We formulate two evaluative tasks: target word prediction (TWP) (i.e., predict the masked target word in a conversation) and target word selection (TWS) (i.e., select the most likely masked target word in a conversation, from among a set of candidate words). Extending MD3, an existing dialectic dataset of taboo-playing conversations, we introduce M-MD3, a target-word-masked version of MD3 with the en-US and en-IN subsets. We create two subsets: en-MV (where en-US is transformed to include dialectal information) and en-TR (where dialectal information is removed from en-IN). We evaluate three multilingual LLMs–one open source (Llama3) and two closed-source (GPT-4/3.5). LLMs perform significantly better for US English than Indian English for both TWP and TWS tasks, for all settings, exhibiting marginalisation against the Indian dialect of English. While GPT-based models perform the best, the comparatively smaller models work more equitably after fine-tuning. Our evaluation methodology exhibits a novel and reproducible way to examine attributes of language models using pre-existing dialogue datasets with language varieties. Dialect being an artifact of one’s culture, this paper demonstrates the gap in the performance of multilingual LLMs for communities that do not use a mainstream dialect.</abstract>
<identifier type="citekey">srirag-etal-2025-evaluating</identifier>
<location>
<url>https://aclanthology.org/2025.sumeval-2.3/</url>
</location>
<part>
<date>2025-01</date>
<extent unit="page">
<start>24</start>
<end>38</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Evaluating Dialect Robustness of Language Models via Conversation Understanding
%A Srirag, Dipankar
%A Sahoo, Nihar Ranjan
%A Joshi, Aditya
%S Proceedings of the Second Workshop on Scaling Up Multilingual & Multi-Cultural Evaluation
%D 2025
%8 January
%I Association for Computational Linguistics
%C Abu Dhabi
%F srirag-etal-2025-evaluating
%X With an evergrowing number of LLMs reporting superlative performance for English, their ability to perform equitably for different dialects of English (i.e., dialect robustness) needs to be ascertained. Specifically, we use English language (US English or Indian English) conversations between humans who play the word-guessing game of ‘taboo’. We formulate two evaluative tasks: target word prediction (TWP) (i.e., predict the masked target word in a conversation) and target word selection (TWS) (i.e., select the most likely masked target word in a conversation, from among a set of candidate words). Extending MD3, an existing dialectic dataset of taboo-playing conversations, we introduce M-MD3, a target-word-masked version of MD3 with the en-US and en-IN subsets. We create two subsets: en-MV (where en-US is transformed to include dialectal information) and en-TR (where dialectal information is removed from en-IN). We evaluate three multilingual LLMs–one open source (Llama3) and two closed-source (GPT-4/3.5). LLMs perform significantly better for US English than Indian English for both TWP and TWS tasks, for all settings, exhibiting marginalisation against the Indian dialect of English. While GPT-based models perform the best, the comparatively smaller models work more equitably after fine-tuning. Our evaluation methodology exhibits a novel and reproducible way to examine attributes of language models using pre-existing dialogue datasets with language varieties. Dialect being an artifact of one’s culture, this paper demonstrates the gap in the performance of multilingual LLMs for communities that do not use a mainstream dialect.
%U https://aclanthology.org/2025.sumeval-2.3/
%P 24-38
Markdown (Informal)
[Evaluating Dialect Robustness of Language Models via Conversation Understanding](https://aclanthology.org/2025.sumeval-2.3/) (Srirag et al., SUMEval 2025)
ACL
Dipankar Srirag, Nihar Ranjan Sahoo, and Aditya Joshi. 2025. [Evaluating Dialect Robustness of Language Models via Conversation Understanding](https://aclanthology.org/2025.sumeval-2.3/). In *Proceedings of the Second Workshop on Scaling Up Multilingual & Multi-Cultural Evaluation*, pages 24–38, Abu Dhabi. Association for Computational Linguistics.