@inproceedings{mukherjee-etal-2025-evaluating,
title = "Evaluating Text Style Transfer Evaluation: Are There Any Reliable Metrics?",
author = "Mukherjee, Sourabrata and
Ojha, Atul Kr. and
McCrae, John Philip and
Dusek, Ondrej",
editor = "Ebrahimi, Abteen and
Haider, Samar and
Liu, Emmy and
Leonor Pacheco, Maria and
Wein, Shira",
booktitle = "Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 4: Student Research Workshop)",
month = apr,
year = "2025",
address = "Albuquerque, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.naacl-srw.41/",
doi = "10.18653/v1/2025.naacl-srw.41",
pages = "418--434",
ISBN = "979-8-89176-192-6",
abstract = "Text style transfer (TST) is the task of transforming a text to reflect a particular style while preserving its original content. Evaluating TSToutputs is a multidimensional challenge, requiring the assessment of style transfer accuracy, content preservation, and naturalness. Us-ing human evaluation is ideal but costly, as is common in other natural language processing (NLP) tasks; however, automatic metrics forTST have not received as much attention as metrics for, e.g., machine translation or summarization. In this paper, we examine both set ofexisting and novel metrics from broader NLP tasks for TST evaluation, focusing on two popular subtasks{---}sentiment transfer and detoxification{---}in a multilingual context comprising English, Hindi, and Bengali. By conducting meta-evaluation through correlation with hu-man judgments, we demonstrate the effectiveness of these metrics when used individually and in ensembles. Additionally, we investigatethe potential of large language models (LLMs) as tools for TST evaluation. Our findings highlight newly applied advanced NLP metrics andLLM-based evaluations provide better insights than existing TST metrics. Our oracle ensemble approaches show even more potential."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="mukherjee-etal-2025-evaluating">
<titleInfo>
<title>Evaluating Text Style Transfer Evaluation: Are There Any Reliable Metrics?</title>
</titleInfo>
<name type="personal">
<namePart type="given">Sourabrata</namePart>
<namePart type="family">Mukherjee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Atul</namePart>
<namePart type="given">Kr.</namePart>
<namePart type="family">Ojha</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">John</namePart>
<namePart type="given">Philip</namePart>
<namePart type="family">McCrae</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ondrej</namePart>
<namePart type="family">Dusek</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-04</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 4: Student Research Workshop)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Abteen</namePart>
<namePart type="family">Ebrahimi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Samar</namePart>
<namePart type="family">Haider</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Emmy</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sammar</namePart>
<namePart type="family">Haider</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Leonor Pacheco</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shira</namePart>
<namePart type="family">Wein</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Albuquerque, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-192-6</identifier>
</relatedItem>
<abstract>Text style transfer (TST) is the task of transforming a text to reflect a particular style while preserving its original content. Evaluating TSToutputs is a multidimensional challenge, requiring the assessment of style transfer accuracy, content preservation, and naturalness. Us-ing human evaluation is ideal but costly, as is common in other natural language processing (NLP) tasks; however, automatic metrics forTST have not received as much attention as metrics for, e.g., machine translation or summarization. In this paper, we examine both set ofexisting and novel metrics from broader NLP tasks for TST evaluation, focusing on two popular subtasks—sentiment transfer and detoxification—in a multilingual context comprising English, Hindi, and Bengali. By conducting meta-evaluation through correlation with hu-man judgments, we demonstrate the effectiveness of these metrics when used individually and in ensembles. Additionally, we investigatethe potential of large language models (LLMs) as tools for TST evaluation. Our findings highlight newly applied advanced NLP metrics andLLM-based evaluations provide better insights than existing TST metrics. Our oracle ensemble approaches show even more potential.</abstract>
<identifier type="citekey">mukherjee-etal-2025-evaluating</identifier>
<identifier type="doi">10.18653/v1/2025.naacl-srw.41</identifier>
<location>
<url>https://aclanthology.org/2025.naacl-srw.41/</url>
</location>
<part>
<date>2025-04</date>
<extent unit="page">
<start>418</start>
<end>434</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Evaluating Text Style Transfer Evaluation: Are There Any Reliable Metrics?
%A Mukherjee, Sourabrata
%A Ojha, Atul Kr.
%A McCrae, John Philip
%A Dusek, Ondrej
%Y Ebrahimi, Abteen
%Y Haider, Samar
%Y Liu, Emmy
%Y Leonor Pacheco, Maria
%Y Wein, Shira
%S Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 4: Student Research Workshop)
%D 2025
%8 April
%I Association for Computational Linguistics
%C Albuquerque, USA
%@ 979-8-89176-192-6
%F mukherjee-etal-2025-evaluating
%X Text style transfer (TST) is the task of transforming a text to reflect a particular style while preserving its original content. Evaluating TST outputs is a multidimensional challenge, requiring the assessment of style transfer accuracy, content preservation, and naturalness. Using human evaluation is ideal but costly, as is common in other natural language processing (NLP) tasks; however, automatic metrics for TST have not received as much attention as metrics for, e.g., machine translation or summarization. In this paper, we examine both existing and novel metrics from broader NLP tasks for TST evaluation, focusing on two popular subtasks—sentiment transfer and detoxification—in a multilingual context comprising English, Hindi, and Bengali. By conducting meta-evaluation through correlation with human judgments, we demonstrate the effectiveness of these metrics when used individually and in ensembles. Additionally, we investigate the potential of large language models (LLMs) as tools for TST evaluation. Our findings highlight that newly applied advanced NLP metrics and LLM-based evaluations provide better insights than existing TST metrics. Our oracle ensemble approaches show even more potential.
%R 10.18653/v1/2025.naacl-srw.41
%U https://aclanthology.org/2025.naacl-srw.41/
%U https://doi.org/10.18653/v1/2025.naacl-srw.41
%P 418-434
Markdown (Informal)
[Evaluating Text Style Transfer Evaluation: Are There Any Reliable Metrics?](https://aclanthology.org/2025.naacl-srw.41/) (Mukherjee et al., NAACL 2025)
ACL
- Sourabrata Mukherjee, Atul Kr. Ojha, John Philip McCrae, and Ondrej Dusek. 2025. Evaluating Text Style Transfer Evaluation: Are There Any Reliable Metrics? In Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 4: Student Research Workshop), pages 418–434, Albuquerque, USA. Association for Computational Linguistics.