@inproceedings{yang-etal-2025-covoger,
title = "{C}o{V}o{GER}: A Multilingual Multitask Benchmark for Speech-to-text Generative Error Correction with Large Language Models",
author = "Yang, Zhengdong and
Wan, Zhen and
Li, Sheng and
Yang, Chao-Han Huck and
Chu, Chenhui",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.emnlp-main.320/",
pages = "6313--6325",
ISBN = "979-8-89176-332-6",
abstract = "Large language models (LLMs) can rewrite the N-best hypotheses from a speech-to-text model, often fixing recognition or translation errors that traditional rescoring cannot. Yet research on generative error correction (GER) has been focusing on monolingual automatic speech recognition (ASR), leaving its multilingual and multitask potential underexplored. We introduce CoVoGER, a benchmark for GER that covers both ASR and speech-to-text translation (ST) across 15 languages and 28 language pairs. CoVoGER is constructed by decoding Common Voice 20.0 and CoVoST-2 with Whisper of three model sizes and SeamlessM4T of two model sizes, providing 5-best lists obtained via a mixture of beam search and temperature sampling. We evaluated various instruction-tuned LLMs, including commercial models in zero-shot mode and open-sourced models with LoRA fine-tuning, and found that the mixture decoding strategy yields the best GER performance in most settings. CoVoGER will be released to promote research on reliable language-universal speech-to-text GER. The code and data for the benchmark are available at https://github.com/N-Orien/CoVoGER."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="yang-etal-2025-covoger">
<titleInfo>
<title>CoVoGER: A Multilingual Multitask Benchmark for Speech-to-text Generative Error Correction with Large Language Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Zhengdong</namePart>
<namePart type="family">Yang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhen</namePart>
<namePart type="family">Wan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sheng</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chao-Han</namePart>
<namePart type="given">Huck</namePart>
<namePart type="family">Yang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chenhui</namePart>
<namePart type="family">Chu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Christos</namePart>
<namePart type="family">Christodoulopoulos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tanmoy</namePart>
<namePart type="family">Chakraborty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Carolyn</namePart>
<namePart type="family">Rose</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Violet</namePart>
<namePart type="family">Peng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Suzhou, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-332-6</identifier>
</relatedItem>
<abstract>Large language models (LLMs) can rewrite the N-best hypotheses from a speech-to-text model, often fixing recognition or translation errors that traditional rescoring cannot. Yet research on generative error correction (GER) has been focusing on monolingual automatic speech recognition (ASR), leaving its multilingual and multitask potential underexplored. We introduce CoVoGER, a benchmark for GER that covers both ASR and speech-to-text translation (ST) across 15 languages and 28 language pairs. CoVoGER is constructed by decoding Common Voice 20.0 and CoVoST-2 with Whisper of three model sizes and SeamlessM4T of two model sizes, providing 5-best lists obtained via a mixture of beam search and temperature sampling. We evaluated various instruction-tuned LLMs, including commercial models in zero-shot mode and open-sourced models with LoRA fine-tuning, and found that the mixture decoding strategy yields the best GER performance in most settings. CoVoGER will be released to promote research on reliable language-universal speech-to-text GER. The code and data for the benchmark are available at https://github.com/N-Orien/CoVoGER.</abstract>
<identifier type="citekey">yang-etal-2025-covoger</identifier>
<location>
<url>https://aclanthology.org/2025.emnlp-main.320/</url>
</location>
<part>
<date>2025-11</date>
<extent unit="page">
<start>6313</start>
<end>6325</end>
</extent>
</part>
</mods>
</modsCollection>
Markdown (Informal)
[CoVoGER: A Multilingual Multitask Benchmark for Speech-to-text Generative Error Correction with Large Language Models](https://aclanthology.org/2025.emnlp-main.320/) (Yang et al., EMNLP 2025)