@inproceedings{singh-warstadt-2026-fine,
title = "Fine-tuning Whisper Across 81 Languages",
author = "Singh, Shivam and
Warstadt, Alex",
editor = "Voigt, Rob and
Warstadt, Alex and
Feldman, Naomi and
Linzen, Tal",
booktitle = "Proceedings of the Society for Computation in Linguistics 2026",
month = jul,
year = "2026",
address = "San Diego, CA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.scil-main.37/",
pages = "408--410",
ISBN = "979-8-89176-412-5",
abstract = "We fine-tune Whisper large-v3 independently on each of the 81 languages in the FLEURS benchmark. Fine-tuning improves WER for all 81 languages, reducing it by nearly 30{\%} on average. However, improvement varies widely, and the language{'}s writing system is the best predictor of success. Latin and Cyrillic script languages reach single-digit WERs, while languages with unique scripts (Thai, Georgian, Burmese, Khmer) benefit least. We further show that Whisper{'}s BPE compression ratio predicts fine-tuning headroom (Spearman {\ensuremath{\rho}} {\ensuremath{\approx}} {\ensuremath{-}}0.78), pointing to tokenization as the underlying bottleneck. We will release model weights upon publication."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="singh-warstadt-2026-fine">
<titleInfo>
<title>Fine-tuning Whisper Across 81 Languages</title>
</titleInfo>
<name type="personal">
<namePart type="given">Shivam</namePart>
<namePart type="family">Singh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alex</namePart>
<namePart type="family">Warstadt</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Society for Computation in Linguistics 2026</title>
</titleInfo>
<name type="personal">
<namePart type="given">Rob</namePart>
<namePart type="family">Voigt</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alex</namePart>
<namePart type="family">Warstadt</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Naomi</namePart>
<namePart type="family">Feldman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tal</namePart>
<namePart type="family">Linzen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, CA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-412-5</identifier>
</relatedItem>
<abstract>We fine-tune Whisper large-v3 independently on each of the 81 languages in the FLEURS benchmark. Fine-tuning improves WER for all 81 languages, reducing it by nearly 30% on average. However, improvement varies widely, and the language’s writing system is the best predictor of success. Latin and Cyrillic script languages reach single-digit WERs, while languages with unique scripts (Thai, Georgian, Burmese, Khmer) benefit least. We further show that Whisper’s BPE compression ratio predicts fine-tuning headroom (Spearman ρ ≈ −0.78), pointing to tokenization as the underlying bottleneck. We will release model weights upon publication.</abstract>
<identifier type="citekey">singh-warstadt-2026-fine</identifier>
<location>
<url>https://aclanthology.org/2026.scil-main.37/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>408</start>
<end>410</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Fine-tuning Whisper Across 81 Languages
%A Singh, Shivam
%A Warstadt, Alex
%Y Voigt, Rob
%Y Warstadt, Alex
%Y Feldman, Naomi
%Y Linzen, Tal
%S Proceedings of the Society for Computation in Linguistics 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, CA
%@ 979-8-89176-412-5
%F singh-warstadt-2026-fine
%X We fine-tune Whisper large-v3 independently on each of the 81 languages in the FLEURS benchmark. Fine-tuning improves WER for all 81 languages, reducing it by nearly 30% on average. However, improvement varies widely, and the language’s writing system is the best predictor of success. Latin and Cyrillic script languages reach single-digit WERs, while languages with unique scripts (Thai, Georgian, Burmese, Khmer) benefit least. We further show that Whisper’s BPE compression ratio predicts fine-tuning headroom (Spearman ρ ≈ −0.78), pointing to tokenization as the underlying bottleneck. We will release model weights upon publication.
%U https://aclanthology.org/2026.scil-main.37/
%P 408-410
Markdown (Informal)
[Fine-tuning Whisper Across 81 Languages](https://aclanthology.org/2026.scil-main.37/) (Singh & Warstadt, SCiL 2026)
ACL
- Shivam Singh and Alex Warstadt. 2026. Fine-tuning Whisper Across 81 Languages. In Proceedings of the Society for Computation in Linguistics 2026, pages 408–410, San Diego, CA. Association for Computational Linguistics.