@inproceedings{parsons-etal-2025-match,
title = "Match `em: {Multi-Tiered} Alignment for Error Analysis in {ASR}",
author = "Parsons, Phoebe and
Kvale, Knut and
Svendsen, Torbj{\o}rn and
Salvi, Giampiero",
editor = "Johansson, Richard and
Stymne, Sara",
booktitle = "Proceedings of the Joint 25th Nordic Conference on Computational Linguistics and 11th Baltic Conference on Human Language Technologies (NoDaLiDa/Baltic-HLT 2025)",
month = mar,
year = "2025",
address = "Tallinn, Estonia",
publisher = "University of Tartu Library",
url = "https://aclanthology.org/2025.nodalida-1.48/",
pages = "440--447",
ISBN = "978-9908-53-109-0",
abstract = "We introduce ``Match `em'': a new framework for aligning output from automatic speech recognition (ASR) with reference transcriptions. This allows a more detailed analysis of errors produced by end-to-end ASR systems compared to word error rate (WER). Match `em performs the alignment on both the word and character level; each relying on information from the other to provide the most meaningful global alignment. At the character level, we define a speech production motivated character similarity metric. At the word level, we rely on character similarities to define word similarity and, additionally, we reconcile compounding (insertion or deletion of spaces). We evaluated Match `em on transcripts of three European languages produced by wav2vec2 and Whisper. We show that Match `em results in more similar word substitution pairs and that compound reconciling can capture a broad range of spacing errors. We believe Match `em to be a valuable tool for ASR error analysis across many languages."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="parsons-etal-2025-match">
<titleInfo>
<title>Match ‘em: Multi-Tiered Alignment for Error Analysis in ASR</title>
</titleInfo>
<name type="personal">
<namePart type="given">Phoebe</namePart>
<namePart type="family">Parsons</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Knut</namePart>
<namePart type="family">Kvale</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Torbjørn</namePart>
<namePart type="family">Svendsen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Giampiero</namePart>
<namePart type="family">Salvi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Joint 25th Nordic Conference on Computational Linguistics and 11th Baltic Conference on Human Language Technologies (NoDaLiDa/Baltic-HLT 2025)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Richard</namePart>
<namePart type="family">Johansson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sara</namePart>
<namePart type="family">Stymne</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>University of Tartu Library</publisher>
<place>
<placeTerm type="text">Tallinn, Estonia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">978-9908-53-109-0</identifier>
</relatedItem>
<abstract>We introduce “Match ‘em”: a new framework for aligning output from automatic speech recognition (ASR) with reference transcriptions. This allows a more detailed analysis of errors produced by end-to-end ASR systems compared to word error rate (WER). Match ‘em performs the alignment on both the word and character level; each relying on information from the other to provide the most meaningful global alignment. At the character level, we define a speech production motivated character similarity metric. At the word level, we rely on character similarities to define word similarity and, additionally, we reconcile compounding (insertion or deletion of spaces). We evaluated Match ‘em on transcripts of three European languages produced by wav2vec2 and Whisper. We show that Match ‘em results in more similar word substitution pairs and that compound reconciling can capture a broad range of spacing errors. We believe Match ‘em to be a valuable tool for ASR error analysis across many languages.</abstract>
<identifier type="citekey">parsons-etal-2025-match</identifier>
<location>
<url>https://aclanthology.org/2025.nodalida-1.48/</url>
</location>
<part>
<date>2025-03</date>
<extent unit="page">
<start>440</start>
<end>447</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Match ‘em: Multi-Tiered Alignment for Error Analysis in ASR
%A Parsons, Phoebe
%A Kvale, Knut
%A Svendsen, Torbjørn
%A Salvi, Giampiero
%Y Johansson, Richard
%Y Stymne, Sara
%S Proceedings of the Joint 25th Nordic Conference on Computational Linguistics and 11th Baltic Conference on Human Language Technologies (NoDaLiDa/Baltic-HLT 2025)
%D 2025
%8 March
%I University of Tartu Library
%C Tallinn, Estonia
%@ 978-9908-53-109-0
%F parsons-etal-2025-match
%X We introduce “Match ‘em”: a new framework for aligning output from automatic speech recognition (ASR) with reference transcriptions. This allows a more detailed analysis of errors produced by end-to-end ASR systems compared to word error rate (WER). Match ‘em performs the alignment on both the word and character level; each relying on information from the other to provide the most meaningful global alignment. At the character level, we define a speech production motivated character similarity metric. At the word level, we rely on character similarities to define word similarity and, additionally, we reconcile compounding (insertion or deletion of spaces). We evaluated Match ‘em on transcripts of three European languages produced by wav2vec2 and Whisper. We show that Match ‘em results in more similar word substitution pairs and that compound reconciling can capture a broad range of spacing errors. We believe Match ‘em to be a valuable tool for ASR error analysis across many languages.
%U https://aclanthology.org/2025.nodalida-1.48/
%P 440-447
Markdown (Informal)
[Match ‘em: Multi-Tiered Alignment for Error Analysis in ASR](https://aclanthology.org/2025.nodalida-1.48/) (Parsons et al., NoDaLiDa 2025)
ACL
- Phoebe Parsons, Knut Kvale, Torbjørn Svendsen, and Giampiero Salvi. 2025. Match ‘em: Multi-Tiered Alignment for Error Analysis in ASR. In Proceedings of the Joint 25th Nordic Conference on Computational Linguistics and 11th Baltic Conference on Human Language Technologies (NoDaLiDa/Baltic-HLT 2025), pages 440–447, Tallinn, Estonia. University of Tartu Library.