@inproceedings{anschutz-etal-2023-correct,
title = "This is not correct! Negation-aware Evaluation of Language Generation Systems",
author = {Ansch{\"u}tz, Miriam and
Miguel Lozano, Diego and
Groh, Georg},
editor = "Keet, C. Maria and
Lee, Hung-Yi and
Zarrie{\ss}, Sina",
booktitle = "Proceedings of the 16th International Natural Language Generation Conference",
month = sep,
year = "2023",
address = "Prague, Czechia",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.inlg-main.12",
doi = "10.18653/v1/2023.inlg-main.12",
pages = "163--175",
abstract = "Large language models underestimate the impact of negations on how much they change the meaning of a sentence. Therefore, learned evaluation metrics based on these models are insensitive to negations. In this paper, we propose NegBLEURT, a negation-aware version of the BLEURT evaluation metric. For that, we designed a rule-based sentence negation tool and used it to create the CANNOT negation evaluation dataset. Based on this dataset, we fine-tuned a sentence transformer and an evaluation metric to improve their negation sensitivity. Evaluating these models on existing benchmarks shows that our fine-tuned models outperform existing metrics on the negated sentences by far while preserving their base models{'} performances on other perturbations.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="anschutz-etal-2023-correct">
<titleInfo>
<title>This is not correct! Negation-aware Evaluation of Language Generation Systems</title>
</titleInfo>
<name type="personal">
<namePart type="given">Miriam</namePart>
<namePart type="family">Anschütz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Diego</namePart>
<namePart type="family">Miguel Lozano</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Georg</namePart>
<namePart type="family">Groh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-09</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 16th International Natural Language Generation Conference</title>
</titleInfo>
<name type="personal">
<namePart type="given">C</namePart>
<namePart type="given">Maria</namePart>
<namePart type="family">Keet</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hung-Yi</namePart>
<namePart type="family">Lee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sina</namePart>
<namePart type="family">Zarrieß</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Prague, Czechia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Large language models underestimate the impact of negations on how much they change the meaning of a sentence. Therefore, learned evaluation metrics based on these models are insensitive to negations. In this paper, we propose NegBLEURT, a negation-aware version of the BLEURT evaluation metric. For that, we designed a rule-based sentence negation tool and used it to create the CANNOT negation evaluation dataset. Based on this dataset, we fine-tuned a sentence transformer and an evaluation metric to improve their negation sensitivity. Evaluating these models on existing benchmarks shows that our fine-tuned models outperform existing metrics on the negated sentences by far while preserving their base models’ performances on other perturbations.</abstract>
<identifier type="citekey">anschutz-etal-2023-correct</identifier>
<identifier type="doi">10.18653/v1/2023.inlg-main.12</identifier>
<location>
<url>https://aclanthology.org/2023.inlg-main.12</url>
</location>
<part>
<date>2023-09</date>
<extent unit="page">
<start>163</start>
<end>175</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T This is not correct! Negation-aware Evaluation of Language Generation Systems
%A Anschütz, Miriam
%A Miguel Lozano, Diego
%A Groh, Georg
%Y Keet, C. Maria
%Y Lee, Hung-Yi
%Y Zarrieß, Sina
%S Proceedings of the 16th International Natural Language Generation Conference
%D 2023
%8 September
%I Association for Computational Linguistics
%C Prague, Czechia
%F anschutz-etal-2023-correct
%X Large language models underestimate the impact of negations on how much they change the meaning of a sentence. Therefore, learned evaluation metrics based on these models are insensitive to negations. In this paper, we propose NegBLEURT, a negation-aware version of the BLEURT evaluation metric. For that, we designed a rule-based sentence negation tool and used it to create the CANNOT negation evaluation dataset. Based on this dataset, we fine-tuned a sentence transformer and an evaluation metric to improve their negation sensitivity. Evaluating these models on existing benchmarks shows that our fine-tuned models outperform existing metrics on the negated sentences by far while preserving their base models’ performances on other perturbations.
%R 10.18653/v1/2023.inlg-main.12
%U https://aclanthology.org/2023.inlg-main.12
%U https://doi.org/10.18653/v1/2023.inlg-main.12
%P 163-175
Markdown (Informal)
[This is not correct! Negation-aware Evaluation of Language Generation Systems](https://aclanthology.org/2023.inlg-main.12) (Anschütz et al., INLG-SIGDIAL 2023)
ACL
Miriam Anschütz, Diego Miguel Lozano, and Georg Groh. 2023. This is not correct! Negation-aware Evaluation of Language Generation Systems. In Proceedings of the 16th International Natural Language Generation Conference, pages 163–175, Prague, Czechia. Association for Computational Linguistics.