% Workshop paper record (LoResLM 2026). Case-sensitive names in the titles are
% brace-protected as whole words so sentence-casing styles keep their capitals.
% NOTE(review): the address field presumably records the conference venue rather
% than the publisher's city - confirm against what the target style expects.
@inproceedings{gundam-mamidi-2026-telugueval,
  title     = {{TeluguEval}: A Comprehensive Benchmark for Evaluating {LLM} Capabilities in {Telugu}},
  author    = {Gundam, Revanth Kumar and
               Mamidi, Radhika},
  editor    = {Hettiarachchi, Hansi and
               Ranasinghe, Tharindu and
               Plum, Alistair and
               Rayson, Paul and
               Mitkov, Ruslan and
               Gaber, Mohamed and
               Premasiri, Damith and
               Tan, Fiona Anting and
               Uyangodage, Lasitha},
  booktitle = {Proceedings of the Second Workshop on Language Models for Low-Resource Languages ({LoResLM} 2026)},
  month     = mar,
  year      = {2026},
  address   = {Rabat, Morocco},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.loreslm-1.20/},
  pages     = {212--224},
  isbn      = {979-8-89176-377-7},
  abstract  = {Large Language Models (LLMs) excel on English reasoning tasks but falter on morphologically rich, low-resource languages such as Telugu, Tamil, and Kannada. We present TeluguEval, a human-curated reasoning benchmark created by translating GSM8K (math), Winogrande (commonsense), ARC (science), CaseHOLD (law), and Hendrycks Ethics into Telugu. We evaluate eight models spanning global (Llama-3.1-8B, Llama-2-7B, Qwen-8B, Gemma-7B, Gemini-2.0) and regional (Telugu-Llama2-7B, Indic-Gemma-7B, Sarvam-m-24B) systems. While extremely strong models such as Gemini and Sarvam-m largely retain performance in Telugu, most English-centric models suffer severe accuracy drops, often exceeding 30 to 40 points, particularly on mathematical and scientific reasoning. We further observe systematic failure modes including script sensitivity, option-selection bias, repetition loops, and unintended code-switching. Our results demonstrate that surface-level Telugu fluency does not imply robust reasoning capability, underscoring the need for Telugu-specific data, tokenization, and pretraining. TeluguEval provides a standardized testbed to drive progress on reasoning in low-resource Indian languages.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- Article-level record; the ID matches the citekey identifier further down. -->
  <mods ID="gundam-mamidi-2026-telugueval">
    <titleInfo>
      <title>TeluguEval: A Comprehensive Benchmark for Evaluating LLM Capabilities in Telugu</title>
    </titleInfo>
    <!-- Authors, in listed order. -->
    <name type="personal">
      <namePart type="given">Revanth</namePart>
      <namePart type="given">Kumar</namePart>
      <namePart type="family">Gundam</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Radhika</namePart>
      <namePart type="family">Mamidi</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2026-03</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume with its editors, publisher, place, and ISBN. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the Second Workshop on Language Models for Low-Resource Languages (LoResLM 2026)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Hansi</namePart>
        <namePart type="family">Hettiarachchi</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Tharindu</namePart>
        <namePart type="family">Ranasinghe</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Alistair</namePart>
        <namePart type="family">Plum</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Paul</namePart>
        <namePart type="family">Rayson</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Ruslan</namePart>
        <namePart type="family">Mitkov</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Mohamed</namePart>
        <namePart type="family">Gaber</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Damith</namePart>
        <namePart type="family">Premasiri</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Fiona</namePart>
        <namePart type="given">Anting</namePart>
        <namePart type="family">Tan</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Lasitha</namePart>
        <namePart type="family">Uyangodage</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Rabat, Morocco</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-377-7</identifier>
    </relatedItem>
    <abstract>Large Language Models (LLMs) excel on English reasoning tasks but falter on morphologically rich, low-resource languages such as Telugu, Tamil, and Kannada. We present TeluguEval, a human-curated reasoning benchmark created by translating GSM8K (math), Winogrande (commonsense), ARC (science), CaseHOLD (law), and Hendrycks Ethics into Telugu. We evaluate eight models spanning global (Llama-3.1-8B, Llama-2-7B, Qwen-8B, Gemma-7B, Gemini-2.0) and regional (Telugu-Llama2-7B, Indic-Gemma-7B, Sarvam-m-24B) systems. While extremely strong models such as Gemini and Sarvam-m largely retain performance in Telugu, most English-centric models suffer severe accuracy drops, often exceeding 30 to 40 points, particularly on mathematical and scientific reasoning. We further observe systematic failure modes including script sensitivity, option-selection bias, repetition loops, and unintended code-switching. Our results demonstrate that surface-level Telugu fluency does not imply robust reasoning capability, underscoring the need for Telugu-specific data, tokenization, and pretraining. TeluguEval provides a standardized testbed to drive progress on reasoning in low-resource Indian languages.</abstract>
    <identifier type="citekey">gundam-mamidi-2026-telugueval</identifier>
    <location>
      <url>https://aclanthology.org/2026.loreslm-1.20/</url>
    </location>
    <!-- Position of the article within the host volume (page range). -->
    <part>
      <date>2026-03</date>
      <extent unit="page">
        <start>212</start>
        <end>224</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T TeluguEval: A Comprehensive Benchmark for Evaluating LLM Capabilities in Telugu
%A Gundam, Revanth Kumar
%A Mamidi, Radhika
%Y Hettiarachchi, Hansi
%Y Ranasinghe, Tharindu
%Y Plum, Alistair
%Y Rayson, Paul
%Y Mitkov, Ruslan
%Y Gaber, Mohamed
%Y Premasiri, Damith
%Y Tan, Fiona Anting
%Y Uyangodage, Lasitha
%S Proceedings of the Second Workshop on Language Models for Low-Resource Languages (LoResLM 2026)
%D 2026
%8 March
%I Association for Computational Linguistics
%C Rabat, Morocco
%@ 979-8-89176-377-7
%F gundam-mamidi-2026-telugueval
%X Large Language Models (LLMs) excel on English reasoning tasks but falter on morphologically rich, low-resource languages such as Telugu, Tamil, and Kannada. We present TeluguEval, a human-curated reasoning benchmark created by translating GSM8K (math), Winogrande (commonsense), ARC (science), CaseHOLD (law), and Hendrycks Ethics into Telugu. We evaluate eight models spanning global (Llama-3.1-8B, Llama-2-7B, Qwen-8B, Gemma-7B, Gemini-2.0) and regional (Telugu-Llama2-7B, Indic-Gemma-7B, Sarvam-m-24B) systems. While extremely strong models such as Gemini and Sarvam-m largely retain performance in Telugu, most English-centric models suffer severe accuracy drops, often exceeding 30 to 40 points, particularly on mathematical and scientific reasoning. We further observe systematic failure modes including script sensitivity, option-selection bias, repetition loops, and unintended code-switching. Our results demonstrate that surface-level Telugu fluency does not imply robust reasoning capability, underscoring the need for Telugu-specific data, tokenization, and pretraining. TeluguEval provides a standardized testbed to drive progress on reasoning in low-resource Indian languages.
%U https://aclanthology.org/2026.loreslm-1.20/
%P 212-224
Markdown (Informal)
[TeluguEval: A Comprehensive Benchmark for Evaluating LLM Capabilities in Telugu](https://aclanthology.org/2026.loreslm-1.20/) (Gundam & Mamidi, LoResLM 2026)
ACL