@inproceedings{whetten-kennington-2023-evaluating,
title = "Evaluating and Improving Automatic Speech Recognition using Severity",
author = "Whetten, Ryan and
Kennington, Casey",
editor = "Demner-fushman, Dina and
Ananiadou, Sophia and
Cohen, Kevin",
booktitle = "The 22nd Workshop on Biomedical Natural Language Processing and BioNLP Shared Tasks",
month = jul,
year = "2023",
address = "Toronto, Canada",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.bionlp-1.6",
doi = "10.18653/v1/2023.bionlp-1.6",
pages = "79--91",
abstract = "A common metric for evaluating Automatic Speech Recognition (ASR) is Word Error Rate (WER) which solely takes into account discrepancies at the word-level. Although useful, WER is not guaranteed to correlate well with human judgment or performance on downstream tasks that use ASR. Meaningful assessment of ASR mistakes becomes even more important in high-stake scenarios such as health-care. We propose 2 general measures to evaluate the severity of mistakes made by ASR systems, one based on sentiment analysis and another based on text embeddings. We evaluate these measures on simulated patient-doctor conversations using 5 ASR systems. Results show that these measures capture characteristics of ASR errors that WER does not. Furthermore, we train an ASR system incorporating severity and demonstrate the potential for using severity not only in the evaluation, but in the development of ASR. Advantages and limitations of this methodology are analyzed and discussed.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="whetten-kennington-2023-evaluating">
<titleInfo>
<title>Evaluating and Improving Automatic Speech Recognition using Severity</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ryan</namePart>
<namePart type="family">Whetten</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Casey</namePart>
<namePart type="family">Kennington</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>The 22nd Workshop on Biomedical Natural Language Processing and BioNLP Shared Tasks</title>
</titleInfo>
<name type="personal">
<namePart type="given">Dina</namePart>
<namePart type="family">Demner-fushman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sophia</namePart>
<namePart type="family">Ananiadou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kevin</namePart>
<namePart type="family">Cohen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Toronto, Canada</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>A common metric for evaluating Automatic Speech Recognition (ASR) is Word Error Rate (WER) which solely takes into account discrepancies at the word-level. Although useful, WER is not guaranteed to correlate well with human judgment or performance on downstream tasks that use ASR. Meaningful assessment of ASR mistakes becomes even more important in high-stake scenarios such as health-care. We propose 2 general measures to evaluate the severity of mistakes made by ASR systems, one based on sentiment analysis and another based on text embeddings. We evaluate these measures on simulated patient-doctor conversations using 5 ASR systems. Results show that these measures capture characteristics of ASR errors that WER does not. Furthermore, we train an ASR system incorporating severity and demonstrate the potential for using severity not only in the evaluation, but in the development of ASR. Advantages and limitations of this methodology are analyzed and discussed.</abstract>
<identifier type="citekey">whetten-kennington-2023-evaluating</identifier>
<identifier type="doi">10.18653/v1/2023.bionlp-1.6</identifier>
<location>
<url>https://aclanthology.org/2023.bionlp-1.6</url>
</location>
<part>
<date>2023-07</date>
<extent unit="page">
<start>79</start>
<end>91</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Evaluating and Improving Automatic Speech Recognition using Severity
%A Whetten, Ryan
%A Kennington, Casey
%Y Demner-fushman, Dina
%Y Ananiadou, Sophia
%Y Cohen, Kevin
%S The 22nd Workshop on Biomedical Natural Language Processing and BioNLP Shared Tasks
%D 2023
%8 July
%I Association for Computational Linguistics
%C Toronto, Canada
%F whetten-kennington-2023-evaluating
%X A common metric for evaluating Automatic Speech Recognition (ASR) is Word Error Rate (WER) which solely takes into account discrepancies at the word-level. Although useful, WER is not guaranteed to correlate well with human judgment or performance on downstream tasks that use ASR. Meaningful assessment of ASR mistakes becomes even more important in high-stake scenarios such as health-care. We propose 2 general measures to evaluate the severity of mistakes made by ASR systems, one based on sentiment analysis and another based on text embeddings. We evaluate these measures on simulated patient-doctor conversations using 5 ASR systems. Results show that these measures capture characteristics of ASR errors that WER does not. Furthermore, we train an ASR system incorporating severity and demonstrate the potential for using severity not only in the evaluation, but in the development of ASR. Advantages and limitations of this methodology are analyzed and discussed.
%R 10.18653/v1/2023.bionlp-1.6
%U https://aclanthology.org/2023.bionlp-1.6
%U https://doi.org/10.18653/v1/2023.bionlp-1.6
%P 79-91
Markdown (Informal)
[Evaluating and Improving Automatic Speech Recognition using Severity](https://aclanthology.org/2023.bionlp-1.6) (Whetten & Kennington, BioNLP 2023)
ACL