@inproceedings{radhakrishnan-etal-2023-whispering,
title = "Whispering {LL}a{MA}: A Cross-Modal Generative Error Correction Framework for Speech Recognition",
author = "Radhakrishnan, Srijith and
Yang, Chao-Han and
Khan, Sumeer and
Kumar, Rohit and
Kiani, Narsis and
Gomez-Cabrero, David and
Tegn{\'e}r, Jesper",
editor = "Bouamor, Houda and
Pino, Juan and
Bali, Kalika",
booktitle = "Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing",
month = dec,
year = "2023",
address = "Singapore",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.emnlp-main.618",
doi = "10.18653/v1/2023.emnlp-main.618",
pages = "10007--10016",
abstract = "We introduce a new cross-modal fusion technique designed for generative error correction in automatic speech recognition (ASR). Our methodology leverages both acoustic information and external linguistic representations to generate accurate speech transcription contexts. This marks a step towards a fresh paradigm in generative error correction within the realm of n-best hypotheses. Unlike the existing ranking-based rescoring methods, our approach adeptly uses distinct initialization techniques and parameter-efficient algorithms to boost ASR performance derived from pre-trained speech and text models. Through evaluation across diverse ASR datasets, we assess our fusion technique, demonstrating a 37.66{\%} improvement in word error rate (WER) relative performance compared to the n-best Oracle. To encourage future research, we have made our code and pre-trained models open source at [https://github.com/Srijith-rkr/Whispering-LLaMA](https://github.com/Srijith-rkr/Whispering-LLaMA)",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="radhakrishnan-etal-2023-whispering">
<titleInfo>
<title>Whispering LLaMA: A Cross-Modal Generative Error Correction Framework for Speech Recognition</title>
</titleInfo>
<name type="personal">
<namePart type="given">Srijith</namePart>
<namePart type="family">Radhakrishnan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chao-Han</namePart>
<namePart type="family">Yang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sumeer</namePart>
<namePart type="family">Khan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rohit</namePart>
<namePart type="family">Kumar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Narsis</namePart>
<namePart type="family">Kiani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Gomez-Cabrero</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jesper</namePart>
<namePart type="family">Tegnér</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Houda</namePart>
<namePart type="family">Bouamor</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Juan</namePart>
<namePart type="family">Pino</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kalika</namePart>
<namePart type="family">Bali</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Singapore</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We introduce a new cross-modal fusion technique designed for generative error correction in automatic speech recognition (ASR). Our methodology leverages both acoustic information and external linguistic representations to generate accurate speech transcription contexts. This marks a step towards a fresh paradigm in generative error correction within the realm of n-best hypotheses. Unlike the existing ranking-based rescoring methods, our approach adeptly uses distinct initialization techniques and parameter-efficient algorithms to boost ASR performance derived from pre-trained speech and text models. Through evaluation across diverse ASR datasets, we assess our fusion technique, demonstrating a 37.66% improvement in word error rate (WER) relative performance compared to the n-best Oracle. To encourage future research, we have made our code and pre-trained models open source at [https://github.com/Srijith-rkr/Whispering-LLaMA](https://github.com/Srijith-rkr/Whispering-LLaMA)</abstract>
<identifier type="citekey">radhakrishnan-etal-2023-whispering</identifier>
<identifier type="doi">10.18653/v1/2023.emnlp-main.618</identifier>
<location>
<url>https://aclanthology.org/2023.emnlp-main.618</url>
</location>
<part>
<date>2023-12</date>
<extent unit="page">
<start>10007</start>
<end>10016</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Whispering LLaMA: A Cross-Modal Generative Error Correction Framework for Speech Recognition
%A Radhakrishnan, Srijith
%A Yang, Chao-Han
%A Khan, Sumeer
%A Kumar, Rohit
%A Kiani, Narsis
%A Gomez-Cabrero, David
%A Tegnér, Jesper
%Y Bouamor, Houda
%Y Pino, Juan
%Y Bali, Kalika
%S Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing
%D 2023
%8 December
%I Association for Computational Linguistics
%C Singapore
%F radhakrishnan-etal-2023-whispering
%X We introduce a new cross-modal fusion technique designed for generative error correction in automatic speech recognition (ASR). Our methodology leverages both acoustic information and external linguistic representations to generate accurate speech transcription contexts. This marks a step towards a fresh paradigm in generative error correction within the realm of n-best hypotheses. Unlike the existing ranking-based rescoring methods, our approach adeptly uses distinct initialization techniques and parameter-efficient algorithms to boost ASR performance derived from pre-trained speech and text models. Through evaluation across diverse ASR datasets, we assess our fusion technique, demonstrating a 37.66% improvement in word error rate (WER) relative performance compared to the n-best Oracle. To encourage future research, we have made our code and pre-trained models open source at [https://github.com/Srijith-rkr/Whispering-LLaMA](https://github.com/Srijith-rkr/Whispering-LLaMA)
%R 10.18653/v1/2023.emnlp-main.618
%U https://aclanthology.org/2023.emnlp-main.618
%U https://doi.org/10.18653/v1/2023.emnlp-main.618
%P 10007-10016
Markdown (Informal)
[Whispering LLaMA: A Cross-Modal Generative Error Correction Framework for Speech Recognition](https://aclanthology.org/2023.emnlp-main.618) (Radhakrishnan et al., EMNLP 2023)
ACL