@inproceedings{siegel-etal-2024-probabilities,
title = "The Probabilities Also Matter: A More Faithful Metric for Faithfulness of Free-Text Explanations in Large Language Models",
author = "Siegel, Noah and
Camburu, Oana-Maria and
Heess, Nicolas and
Perez-Ortiz, Maria",
editor = "Ku, Lun-Wei and
Martins, Andre and
Srikumar, Vivek",
booktitle = "Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)",
month = aug,
year = "2024",
address = "Bangkok, Thailand",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.acl-short.49",
doi = "10.18653/v1/2024.acl-short.49",
pages = "530--546",
abstract = "In order to oversee advanced AI systems, it is important to understand their reasons for generating a given output. When prompted, large language models (LLMs) can provide natural language explanations or reasoning traces that sound plausible and receive high ratings from human annotators. However, it is unclear to what extent these explanations are truly capturing the factors responsible for the model{'}s predictions: the most {``}human-like{''} explanation may be different from the one that is most faithful to the model{'}s true decision making process. In this work, we introduce the correlational counterfactual test (CCT), a faithfulness metric based on counterfactual input edits that takes into account not just the binary label change, but the total shift in the model{'}s predicted label distribution. We evaluate the faithfulness of free-text explanations generated by few-shot-prompted LLMs from the Llama-2 family on three NLP tasks. We find that these explanations are indeed more likely to mention factors when they are impactful to the model{'}s prediction, with the degree of association increasing with model size but varying significantly by task.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="siegel-etal-2024-probabilities">
    <titleInfo>
      <title>The Probabilities Also Matter: A More Faithful Metric for Faithfulness of Free-Text Explanations in Large Language Models</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Noah</namePart>
      <namePart type="family">Siegel</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Oana-Maria</namePart>
      <namePart type="family">Camburu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Nicolas</namePart>
      <namePart type="family">Heess</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Maria</namePart>
      <namePart type="family">Perez-Ortiz</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2024-08</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Lun-Wei</namePart>
        <namePart type="family">Ku</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Andre</namePart>
        <namePart type="family">Martins</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Vivek</namePart>
        <namePart type="family">Srikumar</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Bangkok, Thailand</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>In order to oversee advanced AI systems, it is important to understand their reasons for generating a given output. When prompted, large language models (LLMs) can provide natural language explanations or reasoning traces that sound plausible and receive high ratings from human annotators. However, it is unclear to what extent these explanations are truly capturing the factors responsible for the model’s predictions: the most “human-like” explanation may be different from the one that is most faithful to the model’s true decision making process. In this work, we introduce the correlational counterfactual test (CCT), a faithfulness metric based on counterfactual input edits that takes into account not just the binary label change, but the total shift in the model’s predicted label distribution. We evaluate the faithfulness of free-text explanations generated by few-shot-prompted LLMs from the Llama-2 family on three NLP tasks. We find that these explanations are indeed more likely to mention factors when they are impactful to the model’s prediction, with the degree of association increasing with model size but varying significantly by task.</abstract>
    <identifier type="citekey">siegel-etal-2024-probabilities</identifier>
    <identifier type="doi">10.18653/v1/2024.acl-short.49</identifier>
    <location>
      <url>https://aclanthology.org/2024.acl-short.49</url>
    </location>
    <part>
      <date>2024-08</date>
      <extent unit="page">
        <start>530</start>
        <end>546</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T The Probabilities Also Matter: A More Faithful Metric for Faithfulness of Free-Text Explanations in Large Language Models
%A Siegel, Noah
%A Camburu, Oana-Maria
%A Heess, Nicolas
%A Perez-Ortiz, Maria
%Y Ku, Lun-Wei
%Y Martins, Andre
%Y Srikumar, Vivek
%S Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)
%D 2024
%8 August
%I Association for Computational Linguistics
%C Bangkok, Thailand
%F siegel-etal-2024-probabilities
%X In order to oversee advanced AI systems, it is important to understand their reasons for generating a given output. When prompted, large language models (LLMs) can provide natural language explanations or reasoning traces that sound plausible and receive high ratings from human annotators. However, it is unclear to what extent these explanations are truly capturing the factors responsible for the model’s predictions: the most “human-like” explanation may be different from the one that is most faithful to the model’s true decision making process. In this work, we introduce the correlational counterfactual test (CCT), a faithfulness metric based on counterfactual input edits that takes into account not just the binary label change, but the total shift in the model’s predicted label distribution. We evaluate the faithfulness of free-text explanations generated by few-shot-prompted LLMs from the Llama-2 family on three NLP tasks. We find that these explanations are indeed more likely to mention factors when they are impactful to the model’s prediction, with the degree of association increasing with model size but varying significantly by task.
%R 10.18653/v1/2024.acl-short.49
%U https://aclanthology.org/2024.acl-short.49
%U https://doi.org/10.18653/v1/2024.acl-short.49
%P 530-546
Markdown (Informal)
[The Probabilities Also Matter: A More Faithful Metric for Faithfulness of Free-Text Explanations in Large Language Models](https://aclanthology.org/2024.acl-short.49) (Siegel et al., ACL 2024)
ACL
Noah Siegel, Oana-Maria Camburu, Nicolas Heess, and Maria Perez-Ortiz. 2024. The Probabilities Also Matter: A More Faithful Metric for Faithfulness of Free-Text Explanations in Large Language Models. In Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers), pages 530–546, Bangkok, Thailand. Association for Computational Linguistics.
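
Note: the abstract describes the CCT as pairing each counterfactual input edit with (a) how far it shifts the model's predicted label distribution and (b) whether the free-text explanation mentions the edited factor, then measuring the association between the two. The Python sketch below illustrates those two per-edit quantities under stated assumptions: the predict_proba interface, the function names, the use of total variation distance as the "total shift", and the substring-based mention check are all illustrative stand-ins, not the authors' implementation.

# Minimal sketch of the two per-edit quantities the CCT correlates,
# per the abstract. The model interface (predict_proba returning a dict
# of label probabilities), total variation as the shift measure, and the
# substring mention check are assumptions for illustration only.

def total_shift(p, q):
    """Total variation distance between predicted label distributions
    before and after a counterfactual edit."""
    labels = set(p) | set(q)
    return 0.5 * sum(abs(p.get(lab, 0.0) - q.get(lab, 0.0)) for lab in labels)

def cct_pair(model, text, edited_text, factor, explanation):
    """Return (impact, mentioned) for one counterfactual edit.

    impact:    how much the edit moved the model's predicted label
               distribution (not just whether the argmax label flipped)
    mentioned: whether the explanation refers to the edited factor
    """
    p = model.predict_proba(text)          # e.g. {"entailment": 0.8, ...}
    q = model.predict_proba(edited_text)   # distribution after the edit
    impact = total_shift(p, q)
    mentioned = factor.lower() in explanation.lower()
    return impact, mentioned

Across many such edits, a faithful explainer should show a positive association between impact and mentioned; scoring that association over the full distribution shift, rather than counting only binary label flips, is what the abstract presents as the CCT's refinement of counterfactual faithfulness tests.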