@inproceedings{kumar-etal-2026-post,
title = "Post-{ASR} Correction in {H}indi: Comparing Language Models and Large Language Models in Low-Resource Scenarios",
author = "Kumar, Rishabh and
Krishna, Amrith and
Ramakrishnan, Ganesh and
Jyothi, Preethi",
editor = "Demberg, Vera and
Inui, Kentaro and
Marquez, Llu{\'i}s",
booktitle = "Proceedings of the 19th Conference of the {E}uropean Chapter of the {A}ssociation for {C}omputational {L}inguistics (Volume 2: Short Papers)",
month = mar,
year = "2026",
address = "Rabat, Morocco",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.eacl-short.45/",
pages = "636--645",
ISBN = "979-8-89176-381-4",
abstract = "Automatic Speech Recognition (ASR) systems for low-resource languages like Hindi often produce erroneous transcripts due to limited annotated data and linguistic complexity. **Post-ASR correction** using language models (LMs) and large language models (LLMs) offers a promising approach to improve transcription quality. In this work, we compare fine-tuned LMs (mT5, ByT5), fine-tuned LLMs (Nanda 10B), and instruction-tuned LLMs (GPT-4o-mini, LLaMA variants) for post-ASR correction in Hindi. Our findings reveal that **smaller, fine-tuned models** consistently **outperform larger LLMs** in both fine-tuning and in-context learning (ICL) settings. We observe a **U-shaped inverse scaling** trend under zero-shot ICL, where mid-sized LLMs degrade performance before marginal recovery at extreme scales, yet still fall short of fine-tuned models. **ByT5 is more effective for character-level corrections** such as transliteration and word segmentation, while **mT5 handles broader semantic inconsistencies**. We also identify performance drops in out-of-domain settings and propose **mitigation strategies** to preserve domain fidelity. In particular, we observe similar trends in **Marathi and Telugu**, indicating the broader applicability of our findings across low-resource Indian languages."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="kumar-etal-2026-post">
<titleInfo>
<title>Post-ASR Correction in Hindi: Comparing Language Models and Large Language Models in Low-Resource Scenarios</title>
</titleInfo>
<name type="personal">
<namePart type="given">Rishabh</namePart>
<namePart type="family">Kumar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Amrith</namePart>
<namePart type="family">Krishna</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ganesh</namePart>
<namePart type="family">Ramakrishnan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Preethi</namePart>
<namePart type="family">Jyothi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 19th Conference of the European Chapter of the Association for Computational Linguistics (Volume 2: Short Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Vera</namePart>
<namePart type="family">Demberg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kentaro</namePart>
<namePart type="family">Inui</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lluís</namePart>
<namePart type="family">Marquez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Rabat, Morocco</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-381-4</identifier>
</relatedItem>
<abstract>Automatic Speech Recognition (ASR) systems for low-resource languages like Hindi often produce erroneous transcripts due to limited annotated data and linguistic complexity. **Post-ASR correction** using language models (LMs) and large language models (LLMs) offers a promising approach to improve transcription quality. In this work, we compare fine-tuned LMs (mT5, ByT5), fine-tuned LLMs (Nanda 10B), and instruction-tuned LLMs (GPT-4o-mini, LLaMA variants) for post-ASR correction in Hindi. Our findings reveal that **smaller, fine-tuned models** consistently **outperform larger LLMs** in both fine-tuning and in-context learning (ICL) settings. We observe a **U-shaped inverse scaling** trend under zero-shot ICL, where mid-sized LLMs degrade performance before marginal recovery at extreme scales, yet still fall short of fine-tuned models. **ByT5 is more effective for character-level corrections** such as transliteration and word segmentation, while **mT5 handles broader semantic inconsistencies**. We also identify performance drops in out-of-domain settings and propose **mitigation strategies** to preserve domain fidelity. In particular, we observe similar trends in **Marathi and Telugu**, indicating the broader applicability of our findings across low-resource Indian languages.</abstract>
<identifier type="citekey">kumar-etal-2026-post</identifier>
<location>
<url>https://aclanthology.org/2026.eacl-short.45/</url>
</location>
<part>
<date>2026-03</date>
<extent unit="page">
<start>636</start>
<end>645</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Post-ASR Correction in Hindi: Comparing Language Models and Large Language Models in Low-Resource Scenarios
%A Kumar, Rishabh
%A Krishna, Amrith
%A Ramakrishnan, Ganesh
%A Jyothi, Preethi
%Y Demberg, Vera
%Y Inui, Kentaro
%Y Marquez, Lluís
%S Proceedings of the 19th Conference of the European Chapter of the Association for Computational Linguistics (Volume 2: Short Papers)
%D 2026
%8 March
%I Association for Computational Linguistics
%C Rabat, Morocco
%@ 979-8-89176-381-4
%F kumar-etal-2026-post
%X Automatic Speech Recognition (ASR) systems for low-resource languages like Hindi often produce erroneous transcripts due to limited annotated data and linguistic complexity. Post-ASR correction using language models (LMs) and large language models (LLMs) offers a promising approach to improve transcription quality. In this work, we compare fine-tuned LMs (mT5, ByT5), fine-tuned LLMs (Nanda 10B), and instruction-tuned LLMs (GPT-4o-mini, LLaMA variants) for post-ASR correction in Hindi. Our findings reveal that smaller, fine-tuned models consistently outperform larger LLMs in both fine-tuning and in-context learning (ICL) settings. We observe a U-shaped inverse scaling trend under zero-shot ICL, where mid-sized LLMs degrade performance before marginal recovery at extreme scales, yet still fall short of fine-tuned models. ByT5 is more effective for character-level corrections such as transliteration and word segmentation, while mT5 handles broader semantic inconsistencies. We also identify performance drops in out-of-domain settings and propose mitigation strategies to preserve domain fidelity. Notably, we observe similar trends in Marathi and Telugu, indicating the broader applicability of our findings across low-resource Indian languages.
%U https://aclanthology.org/2026.eacl-short.45/
%P 636-645
Markdown (Informal)
[Post-ASR Correction in Hindi: Comparing Language Models and Large Language Models in Low-Resource Scenarios](https://aclanthology.org/2026.eacl-short.45/) (Kumar et al., EACL 2026)
ACL
Rishabh Kumar, Amrith Krishna, Ganesh Ramakrishnan, and Preethi Jyothi. 2026. Post-ASR Correction in Hindi: Comparing Language Models and Large Language Models in Low-Resource Scenarios. In Proceedings of the 19th Conference of the European Chapter of the Association for Computational Linguistics (Volume 2: Short Papers), pages 636–645, Rabat, Morocco. Association for Computational Linguistics.