% BibTeX record for the INLG 2025 paper, ACL Anthology ID 2025.inlg-main.37.
% Non-ASCII names are stored as UTF-8 throughout (process with a UTF-8-aware tool such as Biber).
@inproceedings{terentowicz-etal-2025-un,
  title     = {How (un)faithful are explainable {LLM}-based {NLG} metrics?},
  author    = {Terentowicz, Alex and
               Lango, Mateusz and
               Dusek, Ondrej},
  editor    = {Flek, Lucie and
               Narayan, Shashi and
               Phương, Lê Hồng and
               Pei, Jiahuan},
  booktitle = {Proceedings of the 18th International Natural Language Generation Conference},
  month     = oct,
  year      = {2025},
  address   = {Hanoi, Vietnam},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.inlg-main.37/},
  pages     = {617--658},
  abstract  = {Explainable NLG metrics are becoming a popular research topic; however, the faithfulness of the explanations they provide is typically not evaluated. In this work, we propose a testbed for assessing the faithfulness of span-based metrics by performing controlled perturbations of their explanations and observing changes in the final score. We show that several popular LLM evaluators do not consistently produce faithful explanations.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- MODS record for the paper with citekey terentowicz-etal-2025-un. -->
  <mods ID="terentowicz-etal-2025-un">
    <titleInfo>
      <title>How (un)faithful are explainable LLM-based NLG metrics?</title>
    </titleInfo>

    <!-- Authors of the paper, in listed order. -->
    <name type="personal">
      <namePart type="given">Alex</namePart>
      <namePart type="family">Terentowicz</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Mateusz</namePart>
      <namePart type="family">Lango</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Ondrej</namePart>
      <namePart type="family">Dusek</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>

    <originInfo>
      <dateIssued>2025-10</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>

    <!-- Host item: the proceedings volume containing the paper, with its editors and publisher. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 18th International Natural Language Generation Conference</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Lucie</namePart>
        <namePart type="family">Flek</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Shashi</namePart>
        <namePart type="family">Narayan</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Lê</namePart>
        <namePart type="given">Hồng</namePart>
        <namePart type="family">Phương</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jiahuan</namePart>
        <namePart type="family">Pei</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Hanoi, Vietnam</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>

    <abstract>Explainable NLG metrics are becoming a popular research topic; however, the faithfulness of the explanations they provide is typically not evaluated. In this work, we propose a testbed for assessing the faithfulness of span-based metrics by performing controlled perturbations of their explanations and observing changes in the final score. We show that several popular LLM evaluators do not consistently produce faithful explanations.</abstract>
    <identifier type="citekey">terentowicz-etal-2025-un</identifier>
    <location>
      <url>https://aclanthology.org/2025.inlg-main.37/</url>
    </location>

    <!-- Issue date and page extent of the paper. -->
    <part>
      <date>2025-10</date>
      <extent unit="page">
        <start>617</start>
        <end>658</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T How (un)faithful are explainable LLM-based NLG metrics?
%A Terentowicz, Alex
%A Lango, Mateusz
%A Dusek, Ondrej
%Y Flek, Lucie
%Y Narayan, Shashi
%Y Phương, Lê Hồng
%Y Pei, Jiahuan
%S Proceedings of the 18th International Natural Language Generation Conference
%D 2025
%8 October
%I Association for Computational Linguistics
%C Hanoi, Vietnam
%F terentowicz-etal-2025-un
%X Explainable NLG metrics are becoming a popular research topic; however, the faithfulness of the explanations they provide is typically not evaluated. In this work, we propose a testbed for assessing the faithfulness of span-based metrics by performing controlled perturbations of their explanations and observing changes in the final score. We show that several popular LLM evaluators do not consistently produce faithful explanations.
%U https://aclanthology.org/2025.inlg-main.37/
%P 617-658
Markdown (Informal)
[How (un)faithful are explainable LLM-based NLG metrics?](https://aclanthology.org/2025.inlg-main.37/) (Terentowicz et al., INLG 2025)
ACL