@inproceedings{deutsch-roth-2022-benchmarking,
title = "Benchmarking Answer Verification Methods for Question Answering-Based Summarization Evaluation Metrics",
author = "Deutsch, Daniel and
Roth, Dan",
editor = "Muresan, Smaranda and
Nakov, Preslav and
Villavicencio, Aline",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2022",
month = may,
year = "2022",
address = "Dublin, Ireland",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.findings-acl.296",
doi = "10.18653/v1/2022.findings-acl.296",
pages = "3759--3765",
abstract = "Question answering-based summarization evaluation metrics must automatically determine whether the QA model{'}s prediction is correct or not, a task known as answer verification. In this work, we benchmark the lexical answer verification methods which have been used by current QA-based metrics as well as two more sophisticated text comparison methods, BERTScore and LERC. We find that LERC out-performs the other methods in some settings while remaining statistically indistinguishable from lexical overlap in others. However, our experiments reveal that improved verification performance does not necessarily translate to overall QA-based metric quality: In some scenarios, using a worse verification method {---} or using none at all {---} has comparable performance to using the best verification method, a result that we attribute to properties of the datasets.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="deutsch-roth-2022-benchmarking">
    <titleInfo>
        <title>Benchmarking Answer Verification Methods for Question Answering-Based Summarization Evaluation Metrics</title>
    </titleInfo>
    <name type="personal">
        <namePart type="given">Daniel</namePart>
        <namePart type="family">Deutsch</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Dan</namePart>
        <namePart type="family">Roth</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <originInfo>
        <dateIssued>2022-05</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
        <titleInfo>
            <title>Findings of the Association for Computational Linguistics: ACL 2022</title>
        </titleInfo>
        <name type="personal">
            <namePart type="given">Smaranda</namePart>
            <namePart type="family">Muresan</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <name type="personal">
            <namePart type="given">Preslav</namePart>
            <namePart type="family">Nakov</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <name type="personal">
            <namePart type="given">Aline</namePart>
            <namePart type="family">Villavicencio</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <originInfo>
            <publisher>Association for Computational Linguistics</publisher>
            <place>
                <placeTerm type="text">Dublin, Ireland</placeTerm>
            </place>
        </originInfo>
        <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Question answering-based summarization evaluation metrics must automatically determine whether the QA model’s prediction is correct or not, a task known as answer verification. In this work, we benchmark the lexical answer verification methods which have been used by current QA-based metrics as well as two more sophisticated text comparison methods, BERTScore and LERC. We find that LERC out-performs the other methods in some settings while remaining statistically indistinguishable from lexical overlap in others. However, our experiments reveal that improved verification performance does not necessarily translate to overall QA-based metric quality: In some scenarios, using a worse verification method — or using none at all — has comparable performance to using the best verification method, a result that we attribute to properties of the datasets.</abstract>
    <identifier type="citekey">deutsch-roth-2022-benchmarking</identifier>
    <identifier type="doi">10.18653/v1/2022.findings-acl.296</identifier>
    <location>
        <url>https://aclanthology.org/2022.findings-acl.296</url>
    </location>
    <part>
        <date>2022-05</date>
        <extent unit="page">
            <start>3759</start>
            <end>3765</end>
        </extent>
    </part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Benchmarking Answer Verification Methods for Question Answering-Based Summarization Evaluation Metrics
%A Deutsch, Daniel
%A Roth, Dan
%Y Muresan, Smaranda
%Y Nakov, Preslav
%Y Villavicencio, Aline
%S Findings of the Association for Computational Linguistics: ACL 2022
%D 2022
%8 May
%I Association for Computational Linguistics
%C Dublin, Ireland
%F deutsch-roth-2022-benchmarking
%X Question answering-based summarization evaluation metrics must automatically determine whether the QA model’s prediction is correct or not, a task known as answer verification. In this work, we benchmark the lexical answer verification methods which have been used by current QA-based metrics as well as two more sophisticated text comparison methods, BERTScore and LERC. We find that LERC out-performs the other methods in some settings while remaining statistically indistinguishable from lexical overlap in others. However, our experiments reveal that improved verification performance does not necessarily translate to overall QA-based metric quality: In some scenarios, using a worse verification method — or using none at all — has comparable performance to using the best verification method, a result that we attribute to properties of the datasets.
%R 10.18653/v1/2022.findings-acl.296
%U https://aclanthology.org/2022.findings-acl.296
%U https://doi.org/10.18653/v1/2022.findings-acl.296
%P 3759-3765
Markdown (Informal)
[Benchmarking Answer Verification Methods for Question Answering-Based Summarization Evaluation Metrics](https://aclanthology.org/2022.findings-acl.296) (Deutsch & Roth, Findings 2022)
ACL
Daniel Deutsch and Dan Roth. 2022. [Benchmarking Answer Verification Methods for Question Answering-Based Summarization Evaluation Metrics](https://aclanthology.org/2022.findings-acl.296). In Findings of the Association for Computational Linguistics: ACL 2022, pages 3759–3765, Dublin, Ireland. Association for Computational Linguistics.