@inproceedings{vallejo-etal-2022-evaluating,
    title = "Evaluating the Examiner: The Perils of {P}earson Correlation for Validating Text Similarity Metrics",
    author = "Vallejo, Gisela and
      Baldwin, Timothy and
      Frermann, Lea",
    editor = "Parameswaran, Pradeesh and
      Biggs, Jennifer and
      Powers, David",
    booktitle = "Proceedings of the 20th Annual Workshop of the Australasian Language Technology Association",
    month = dec,
    year = "2022",
    address = "Adelaide, Australia",
    publisher = "Australasian Language Technology Association",
    url = "https://aclanthology.org/2022.alta-1.18/",
    pages = "130--138",
    abstract = "In recent years, researchers have developed question-answering based approaches to automatically evaluate system summaries, reporting improved validity compared to word overlap-based metrics like ROUGE, in terms of correlation with human ratings of criteria including fluency and hallucination. In this paper, we take a closer look at one particular metric, QuestEval, and ask whether: (1) it can serve as a more general metric for long document similarity assessment; and (2) a single correlation score between metric scores and human ratings, as the currently standard approach, is sufficient for metric validation. We find that correlation scores can be misleading, and that score distributions and outliers should be taken into account. With these caveats in mind, QuestEval can be a promising candidate for long document similarity assessment."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="vallejo-etal-2022-evaluating">
    <titleInfo>
      <title>Evaluating the Examiner: The Perils of Pearson Correlation for Validating Text Similarity Metrics</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Gisela</namePart>
      <namePart type="family">Vallejo</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Timothy</namePart>
      <namePart type="family">Baldwin</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Lea</namePart>
      <namePart type="family">Frermann</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2022-12</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 20th Annual Workshop of the Australasian Language Technology Association</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Pradeesh</namePart>
        <namePart type="family">Parameswaran</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jennifer</namePart>
        <namePart type="family">Biggs</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">David</namePart>
        <namePart type="family">Powers</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Australasian Language Technology Association</publisher>
        <place>
          <placeTerm type="text">Adelaide, Australia</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>In recent years, researchers have developed question-answering based approaches to automatically evaluate system summaries, reporting improved validity compared to word overlap-based metrics like ROUGE, in terms of correlation with human ratings of criteria including fluency and hallucination. In this paper, we take a closer look at one particular metric, QuestEval, and ask whether: (1) it can serve as a more general metric for long document similarity assessment; and (2) a single correlation score between metric scores and human ratings, as the currently standard approach, is sufficient for metric validation. We find that correlation scores can be misleading, and that score distributions and outliers should be taken into account. With these caveats in mind, QuestEval can be a promising candidate for long document similarity assessment.</abstract>
    <identifier type="citekey">vallejo-etal-2022-evaluating</identifier>
    <location>
      <url>https://aclanthology.org/2022.alta-1.18/</url>
    </location>
    <part>
      <date>2022-12</date>
      <extent unit="page">
        <start>130</start>
        <end>138</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Evaluating the Examiner: The Perils of Pearson Correlation for Validating Text Similarity Metrics
%A Vallejo, Gisela
%A Baldwin, Timothy
%A Frermann, Lea
%Y Parameswaran, Pradeesh
%Y Biggs, Jennifer
%Y Powers, David
%S Proceedings of the 20th Annual Workshop of the Australasian Language Technology Association
%D 2022
%8 December
%I Australasian Language Technology Association
%C Adelaide, Australia
%F vallejo-etal-2022-evaluating
%X In recent years, researchers have developed question-answering based approaches to automatically evaluate system summaries, reporting improved validity compared to word overlap-based metrics like ROUGE, in terms of correlation with human ratings of criteria including fluency and hallucination. In this paper, we take a closer look at one particular metric, QuestEval, and ask whether: (1) it can serve as a more general metric for long document similarity assessment; and (2) a single correlation score between metric scores and human ratings, as the currently standard approach, is sufficient for metric validation. We find that correlation scores can be misleading, and that score distributions and outliers should be taken into account. With these caveats in mind, QuestEval can be a promising candidate for long document similarity assessment.
%U https://aclanthology.org/2022.alta-1.18/
%P 130-138
Markdown (Informal)
[Evaluating the Examiner: The Perils of Pearson Correlation for Validating Text Similarity Metrics](https://aclanthology.org/2022.alta-1.18/) (Vallejo et al., ALTA 2022)
ACL
Gisela Vallejo, Timothy Baldwin, and Lea Frermann. 2022. Evaluating the Examiner: The Perils of Pearson Correlation for Validating Text Similarity Metrics. In Proceedings of the 20th Annual Workshop of the Australasian Language Technology Association, pages 130–138, Adelaide, Australia. Australasian Language Technology Association.