@inproceedings{logacheva-etal-2022-study,
title = "A Study on Manual and Automatic Evaluation for Text Style Transfer: The Case of Detoxification",
author = "Logacheva, Varvara and
Dementieva, Daryna and
Krotova, Irina and
Fenogenova, Alena and
Nikishina, Irina and
Shavrina, Tatiana and
Panchenko, Alexander",
editor = "Belz, Anya and
Popovi{\'c}, Maja and
Reiter, Ehud and
Shimorina, Anastasia",
booktitle = "Proceedings of the 2nd Workshop on Human Evaluation of NLP Systems (HumEval)",
month = may,
year = "2022",
address = "Dublin, Ireland",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.humeval-1.8",
doi = "10.18653/v1/2022.humeval-1.8",
pages = "90--101",
abstract = "It is often difficult to reliably evaluate models which generate text. Among them, text style transfer is a particularly difficult to evaluate, because its success depends on a number of parameters. We conduct an evaluation of a large number of models on a detoxification task. We explore the relations between the manual and automatic metrics and find that there is only weak correlation between them, which is dependent on the type of model which generated text. Automatic metrics tend to be less reliable for better-performing models. However, our findings suggest that, ChrF and BertScore metrics can be used as a proxy for human evaluation of text detoxification to some extent.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="logacheva-etal-2022-study">
<titleInfo>
<title>A Study on Manual and Automatic Evaluation for Text Style Transfer: The Case of Detoxification</title>
</titleInfo>
<name type="personal">
<namePart type="given">Varvara</namePart>
<namePart type="family">Logacheva</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Daryna</namePart>
<namePart type="family">Dementieva</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Irina</namePart>
<namePart type="family">Krotova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alena</namePart>
<namePart type="family">Fenogenova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Irina</namePart>
<namePart type="family">Nikishina</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tatiana</namePart>
<namePart type="family">Shavrina</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alexander</namePart>
<namePart type="family">Panchenko</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2nd Workshop on Human Evaluation of NLP Systems (HumEval)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Anya</namePart>
<namePart type="family">Belz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Maja</namePart>
<namePart type="family">Popović</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ehud</namePart>
<namePart type="family">Reiter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anastasia</namePart>
<namePart type="family">Shimorina</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Dublin, Ireland</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>It is often difficult to reliably evaluate models which generate text. Among them, text style transfer is a particularly difficult to evaluate, because its success depends on a number of parameters. We conduct an evaluation of a large number of models on a detoxification task. We explore the relations between the manual and automatic metrics and find that there is only weak correlation between them, which is dependent on the type of model which generated text. Automatic metrics tend to be less reliable for better-performing models. However, our findings suggest that, ChrF and BertScore metrics can be used as a proxy for human evaluation of text detoxification to some extent.</abstract>
<identifier type="citekey">logacheva-etal-2022-study</identifier>
<identifier type="doi">10.18653/v1/2022.humeval-1.8</identifier>
<location>
<url>https://aclanthology.org/2022.humeval-1.8</url>
</location>
<part>
<date>2022-05</date>
<extent unit="page">
<start>90</start>
<end>101</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T A Study on Manual and Automatic Evaluation for Text Style Transfer: The Case of Detoxification
%A Logacheva, Varvara
%A Dementieva, Daryna
%A Krotova, Irina
%A Fenogenova, Alena
%A Nikishina, Irina
%A Shavrina, Tatiana
%A Panchenko, Alexander
%Y Belz, Anya
%Y Popović, Maja
%Y Reiter, Ehud
%Y Shimorina, Anastasia
%S Proceedings of the 2nd Workshop on Human Evaluation of NLP Systems (HumEval)
%D 2022
%8 May
%I Association for Computational Linguistics
%C Dublin, Ireland
%F logacheva-etal-2022-study
%X It is often difficult to reliably evaluate models which generate text. Among them, text style transfer is a particularly difficult to evaluate, because its success depends on a number of parameters. We conduct an evaluation of a large number of models on a detoxification task. We explore the relations between the manual and automatic metrics and find that there is only weak correlation between them, which is dependent on the type of model which generated text. Automatic metrics tend to be less reliable for better-performing models. However, our findings suggest that, ChrF and BertScore metrics can be used as a proxy for human evaluation of text detoxification to some extent.
%R 10.18653/v1/2022.humeval-1.8
%U https://aclanthology.org/2022.humeval-1.8
%U https://doi.org/10.18653/v1/2022.humeval-1.8
%P 90-101
Markdown (Informal)
[A Study on Manual and Automatic Evaluation for Text Style Transfer: The Case of Detoxification](https://aclanthology.org/2022.humeval-1.8) (Logacheva et al., HumEval 2022)
ACL