@inproceedings{faisal-anastasopoulos-2025-testing,
    title = "Testing the Boundaries of {LLM}s: Dialectal and Language-Variety Tasks",
    author = "Faisal, Fahim and
      Anastasopoulos, Antonios",
    editor = "Scherrer, Yves and
      Jauhiainen, Tommi and
      Ljube{\v{s}}i{\'c}, Nikola and
      Nakov, Preslav and
      Tiedemann, J{\"o}rg and
      Zampieri, Marcos",
    booktitle = "Proceedings of the 12th Workshop on NLP for Similar Languages, Varieties and Dialects",
    month = jan,
    year = "2025",
    address = "Abu Dhabi, UAE",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2025.vardial-1.6/",
    pages = "68--92",
    abstract = "This study evaluates the performance of large language models (LLMs) on benchmark datasets designed for dialect-specific NLP tasks. Dialectal NLP is a low-resource field, yet it is crucial for evaluating the robustness of language models against linguistic diversity. This work is the first to systematically compare state-of-the-art instruction-tuned LLMs{---}both open-weight multilingual and closed-weight generative models{---}with encoder-based models that rely on supervised task-specific fine-tuning for dialectal tasks. We conduct extensive empirical analyses to provide insights into the current LLM landscape for dialect-focused tasks. Our findings indicate that certain tasks, such as dialect identification, are challenging for LLMs to replicate effectively due to the complexity of multi-class setups and the suitability of these tasks for supervised fine-tuning. Additionally, the structure of task labels{---}whether categorical or continuous scoring{---}significantly affects model performance. While LLMs excel in tasks like machine reading comprehension, their instruction-following ability declines in simpler tasks like POS tagging when task instructions are inherently complex. Overall, subtle variations in prompt design can greatly impact performance, underscoring the need for careful prompt engineering in dialectal evaluations."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="faisal-anastasopoulos-2025-testing">
    <titleInfo>
      <title>Testing the Boundaries of LLMs: Dialectal and Language-Variety Tasks</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Fahim</namePart>
      <namePart type="family">Faisal</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Antonios</namePart>
      <namePart type="family">Anastasopoulos</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2025-01</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 12th Workshop on NLP for Similar Languages, Varieties and Dialects</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Yves</namePart>
        <namePart type="family">Scherrer</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Tommi</namePart>
        <namePart type="family">Jauhiainen</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Nikola</namePart>
        <namePart type="family">Ljubešić</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Preslav</namePart>
        <namePart type="family">Nakov</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jörg</namePart>
        <namePart type="family">Tiedemann</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Marcos</namePart>
        <namePart type="family">Zampieri</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Abu Dhabi, UAE</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>This study evaluates the performance of large language models (LLMs) on benchmark datasets designed for dialect-specific NLP tasks. Dialectal NLP is a low-resource field, yet it is crucial for evaluating the robustness of language models against linguistic diversity. This work is the first to systematically compare state-of-the-art instruction-tuned LLMs—both open-weight multilingual and closed-weight generative models—with encoder-based models that rely on supervised task-specific fine-tuning for dialectal tasks. We conduct extensive empirical analyses to provide insights into the current LLM landscape for dialect-focused tasks. Our findings indicate that certain tasks, such as dialect identification, are challenging for LLMs to replicate effectively due to the complexity of multi-class setups and the suitability of these tasks for supervised fine-tuning. Additionally, the structure of task labels—whether categorical or continuous scoring—significantly affects model performance. While LLMs excel in tasks like machine reading comprehension, their instruction-following ability declines in simpler tasks like POS tagging when task instructions are inherently complex. Overall, subtle variations in prompt design can greatly impact performance, underscoring the need for careful prompt engineering in dialectal evaluations.</abstract>
    <identifier type="citekey">faisal-anastasopoulos-2025-testing</identifier>
    <location>
      <url>https://aclanthology.org/2025.vardial-1.6/</url>
    </location>
    <part>
      <date>2025-01</date>
      <extent unit="page">
        <start>68</start>
        <end>92</end>
      </extent>
    </part>
  </mods>
</modsCollection>
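For reference, a minimal sketch (an illustration, not part of the Anthology record or tooling) of pulling the title, authors, and URL out of the MODS record above using only Python's standard library; the input file name is a placeholder assumption:

import xml.etree.ElementTree as ET

# MODS v3 default namespace, as declared on <modsCollection> above.
MODS_NS = {"m": "http://www.loc.gov/mods/v3"}

# Hypothetical file containing the MODS record above.
tree = ET.parse("faisal-anastasopoulos-2025-testing.xml")
mods = tree.getroot().find("m:mods", MODS_NS)

# Title and URL are single elements; findtext returns their text content.
title = mods.findtext("m:titleInfo/m:title", namespaces=MODS_NS)
url = mods.findtext("m:location/m:url", namespaces=MODS_NS)

# Only direct <name> children of <mods> are matched, so the editors nested
# under <relatedItem> are excluded; the roleTerm check is an extra guard.
authors = [
    " ".join(part.text for part in name.findall("m:namePart", MODS_NS))
    for name in mods.findall("m:name", MODS_NS)
    if name.findtext("m:role/m:roleTerm", namespaces=MODS_NS) == "author"
]

print(title)    # Testing the Boundaries of LLMs: Dialectal and Language-Variety Tasks
print(authors)  # ['Fahim Faisal', 'Antonios Anastasopoulos']
print(url)      # https://aclanthology.org/2025.vardial-1.6/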
%0 Conference Proceedings
%T Testing the Boundaries of LLMs: Dialectal and Language-Variety Tasks
%A Faisal, Fahim
%A Anastasopoulos, Antonios
%Y Scherrer, Yves
%Y Jauhiainen, Tommi
%Y Ljubešić, Nikola
%Y Nakov, Preslav
%Y Tiedemann, Jörg
%Y Zampieri, Marcos
%S Proceedings of the 12th Workshop on NLP for Similar Languages, Varieties and Dialects
%D 2025
%8 January
%I Association for Computational Linguistics
%C Abu Dhabi, UAE
%F faisal-anastasopoulos-2025-testing
%X This study evaluates the performance of large language models (LLMs) on benchmark datasets designed for dialect-specific NLP tasks. Dialectal NLP is a low-resource field, yet it is crucial for evaluating the robustness of language models against linguistic diversity. This work is the first to systematically compare state-of-the-art instruction-tuned LLMs—both open-weight multilingual and closed-weight generative models—with encoder-based models that rely on supervised task-specific fine-tuning for dialectal tasks. We conduct extensive empirical analyses to provide insights into the current LLM landscape for dialect-focused tasks. Our findings indicate that certain tasks, such as dialect identification, are challenging for LLMs to replicate effectively due to the complexity of multi-class setups and the suitability of these tasks for supervised fine-tuning. Additionally, the structure of task labels—whether categorical or continuous scoring—significantly affects model performance. While LLMs excel in tasks like machine reading comprehension, their instruction-following ability declines in simpler tasks like POS tagging when task instructions are inherently complex. Overall, subtle variations in prompt design can greatly impact performance, underscoring the need for careful prompt engineering in dialectal evaluations.
%U https://aclanthology.org/2025.vardial-1.6/
%P 68-92
Markdown (Informal)
[Testing the Boundaries of LLMs: Dialectal and Language-Variety Tasks](https://aclanthology.org/2025.vardial-1.6/) (Faisal & Anastasopoulos, VarDial 2025)
ACL
Fahim Faisal and Antonios Anastasopoulos. 2025. Testing the Boundaries of LLMs: Dialectal and Language-Variety Tasks. In Proceedings of the 12th Workshop on NLP for Similar Languages, Varieties and Dialects, pages 68–92, Abu Dhabi, UAE. Association for Computational Linguistics.