@inproceedings{dehghanighobadi-etal-2025-llms,
title = "Can {LLM}s Explain Themselves Counterfactually?",
author = "Dehghanighobadi, Zahra and
Fischer, Asja and
Zafar, Muhammad Bilal",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.emnlp-main.396/",
pages = "7798--7826",
ISBN = "979-8-89176-332-6",
abstract = "Explanations are an important tool for gaining insights into model behavior, calibrating user trust, and ensuring compliance.Past few years have seen a flurry of methods for generating explanations, many of which involve computing model gradients or solving specially designed optimization problems.Owing to the remarkable reasoning abilities of LLMs, *self-explanation*, i.e., prompting the model to explain its outputs has recently emerged as a new paradigm.We study a specific type of self-explanations, *self-generated counterfactual explanations* (SCEs).We test LLMs' ability to generate SCEs across families, sizes, temperatures, and datasets. We find that LLMs sometimes struggle to generate SCEs. When they do, their prediction often does not agree with their own counterfactual reasoning."
}

<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="dehghanighobadi-etal-2025-llms">
<titleInfo>
<title>Can LLMs Explain Themselves Counterfactually?</title>
</titleInfo>
<name type="personal">
<namePart type="given">Zahra</namePart>
<namePart type="family">Dehghanighobadi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Asja</namePart>
<namePart type="family">Fischer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Muhammad</namePart>
<namePart type="given">Bilal</namePart>
<namePart type="family">Zafar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Christos</namePart>
<namePart type="family">Christodoulopoulos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tanmoy</namePart>
<namePart type="family">Chakraborty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Carolyn</namePart>
<namePart type="family">Rose</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Violet</namePart>
<namePart type="family">Peng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Suzhou, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-332-6</identifier>
</relatedItem>
<abstract>Explanations are an important tool for gaining insights into model behavior, calibrating user trust, and ensuring compliance. The past few years have seen a flurry of methods for generating explanations, many of which involve computing model gradients or solving specially designed optimization problems. Owing to the remarkable reasoning abilities of LLMs, *self-explanation*, i.e., prompting the model to explain its outputs, has recently emerged as a new paradigm. We study a specific type of self-explanation, *self-generated counterfactual explanations* (SCEs). We test LLMs’ ability to generate SCEs across model families, sizes, temperatures, and datasets. We find that LLMs sometimes struggle to generate SCEs. When they do, their prediction often does not agree with their own counterfactual reasoning.</abstract>
<identifier type="citekey">dehghanighobadi-etal-2025-llms</identifier>
<location>
<url>https://aclanthology.org/2025.emnlp-main.396/</url>
</location>
<part>
<date>2025-11</date>
<extent unit="page">
<start>7798</start>
<end>7826</end>
</extent>
</part>
</mods>
</modsCollection>

%0 Conference Proceedings
%T Can LLMs Explain Themselves Counterfactually?
%A Dehghanighobadi, Zahra
%A Fischer, Asja
%A Zafar, Muhammad Bilal
%Y Christodoulopoulos, Christos
%Y Chakraborty, Tanmoy
%Y Rose, Carolyn
%Y Peng, Violet
%S Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-332-6
%F dehghanighobadi-etal-2025-llms
%X Explanations are an important tool for gaining insights into model behavior, calibrating user trust, and ensuring compliance. The past few years have seen a flurry of methods for generating explanations, many of which involve computing model gradients or solving specially designed optimization problems. Owing to the remarkable reasoning abilities of LLMs, *self-explanation*, i.e., prompting the model to explain its outputs, has recently emerged as a new paradigm. We study a specific type of self-explanation, *self-generated counterfactual explanations* (SCEs). We test LLMs’ ability to generate SCEs across model families, sizes, temperatures, and datasets. We find that LLMs sometimes struggle to generate SCEs. When they do, their prediction often does not agree with their own counterfactual reasoning.
%U https://aclanthology.org/2025.emnlp-main.396/
%P 7798-7826

Markdown (Informal)
[Can LLMs Explain Themselves Counterfactually?](https://aclanthology.org/2025.emnlp-main.396/) (Dehghanighobadi et al., EMNLP 2025)
ACL
- Zahra Dehghanighobadi, Asja Fischer, and Muhammad Bilal Zafar. 2025. Can LLMs Explain Themselves Counterfactually? In Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing, pages 7798–7826, Suzhou, China. Association for Computational Linguistics.