@inproceedings{brandl-eberle-2026-systematic,
title = "A Systematic Comparison between Extractive Self-Explanations and Human Rationales in Text Classification",
author = "Brandl, Stephanie and
Eberle, Oliver",
editor = "Chang, Kai-Wei and
Mehrabi, Ninareh and
Krishna, Satyapriya and
Das, Anubrata and
Dhamala, Jwala and
Cao, Yang Trista and
Kumarage, Tharindu and
Ramakrishna, Anil and
Christodoulopoulos, Christos and
Wan, Yixin and
Galystan, Aram and
Kumar, Anoop and
Gupta, Rahul",
booktitle = "Proceedings of the 6th Workshop on Trustworthy {NLP} ({T}rust{NLP} 2026)",
month = jul,
year = "2026",
address = "San Diego, California",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.trustnlp-main.44/",
pages = "563--583",
ISBN = "979-8-89176-418-7",
abstract = "Instruction-tuned LLMs are able to provide *an* explanation about their output to users by generating self-explanations, without requiring the application of complex interpretability techniques. In this paper, we analyse whether this ability results in a *good* explanation. We evaluate self-explanations in the form of input rationales with respect to their plausibility to humans. We study three text classification tasks: sentiment classification, forced labour detection and claim verification. We include Danish and Italian translations of the sentiment classification task and compare self-explanations to human annotations. For this, we collected human rationale annotations for Climate-Fever, a claim verification dataset. We furthermore evaluate the faithfulness of human and self-explanation rationales with respect to correct model predictions, and extend the study by incorporating post-hoc attribution-based explanations. We analyse four open-weight LLMs and find that alignment between self-explanations and human rationales highly depends on text length and task complexity. Nevertheless, self-explanations yield faithful subsets of token-level rationales, whereas post-hoc attribution methods tend to emphasize structural and formatting tokens, reflecting fundamentally different explanation strategies."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="brandl-eberle-2026-systematic">
<titleInfo>
<title>A Systematic Comparison between Extractive Self-Explanations and Human Rationales in Text Classification</title>
</titleInfo>
<name type="personal">
<namePart type="given">Stephanie</namePart>
<namePart type="family">Brandl</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Oliver</namePart>
<namePart type="family">Eberle</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 6th Workshop on Trustworthy NLP (TrustNLP 2026)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kai-Wei</namePart>
<namePart type="family">Chang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ninareh</namePart>
<namePart type="family">Mehrabi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Satyapriya</namePart>
<namePart type="family">Krishna</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anubrata</namePart>
<namePart type="family">Das</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jwala</namePart>
<namePart type="family">Dhamala</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yang</namePart>
<namePart type="given">Trista</namePart>
<namePart type="family">Cao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tharindu</namePart>
<namePart type="family">Kumarage</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anil</namePart>
<namePart type="family">Ramakrishna</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Christos</namePart>
<namePart type="family">Christodoulopoulos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yixin</namePart>
<namePart type="family">Wan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aram</namePart>
<namePart type="family">Galystan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anoop</namePart>
<namePart type="family">Kumar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rahul</namePart>
<namePart type="family">Gupta</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-418-7</identifier>
</relatedItem>
<abstract>Instruction-tuned LLMs are able to provide *an* explanation about their output to users by generating self-explanations, without requiring the application of complex interpretability techniques. In this paper, we analyse whether this ability results in a *good* explanation. We evaluate self-explanations in the form of input rationales with respect to their plausibility to humans. We study three text classification tasks: sentiment classification, forced labour detection and claim verification. We include Danish and Italian translations of the sentiment classification task and compare self-explanations to human annotations. For this, we collected human rationale annotations for Climate-Fever, a claim verification dataset. We furthermore evaluate the faithfulness of human and self-explanation rationales with respect to correct model predictions, and extend the study by incorporating post-hoc attribution-based explanations. We analyse four open-weight LLMs and find that alignment between self-explanations and human rationales highly depends on text length and task complexity. Nevertheless, self-explanations yield faithful subsets of token-level rationales, whereas post-hoc attribution methods tend to emphasize structural and formatting tokens, reflecting fundamentally different explanation strategies.</abstract>
<identifier type="citekey">brandl-eberle-2026-systematic</identifier>
<location>
<url>https://aclanthology.org/2026.trustnlp-main.44/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>563</start>
<end>583</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T A Systematic Comparison between Extractive Self-Explanations and Human Rationales in Text Classification
%A Brandl, Stephanie
%A Eberle, Oliver
%Y Chang, Kai-Wei
%Y Mehrabi, Ninareh
%Y Krishna, Satyapriya
%Y Das, Anubrata
%Y Dhamala, Jwala
%Y Cao, Yang Trista
%Y Kumarage, Tharindu
%Y Ramakrishna, Anil
%Y Christodoulopoulos, Christos
%Y Wan, Yixin
%Y Galystan, Aram
%Y Kumar, Anoop
%Y Gupta, Rahul
%S Proceedings of the 6th Workshop on Trustworthy NLP (TrustNLP 2026)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California
%@ 979-8-89176-418-7
%F brandl-eberle-2026-systematic
%X Instruction-tuned LLMs are able to provide *an* explanation about their output to users by generating self-explanations, without requiring the application of complex interpretability techniques. In this paper, we analyse whether this ability results in a *good* explanation. We evaluate self-explanations in the form of input rationales with respect to their plausibility to humans. We study three text classification tasks: sentiment classification, forced labour detection and claim verification. We include Danish and Italian translations of the sentiment classification task and compare self-explanations to human annotations. For this, we collected human rationale annotations for Climate-Fever, a claim verification dataset. We furthermore evaluate the faithfulness of human and self-explanation rationales with respect to correct model predictions, and extend the study by incorporating post-hoc attribution-based explanations. We analyse four open-weight LLMs and find that alignment between self-explanations and human rationales highly depends on text length and task complexity. Nevertheless, self-explanations yield faithful subsets of token-level rationales, whereas post-hoc attribution methods tend to emphasize structural and formatting tokens, reflecting fundamentally different explanation strategies.
%U https://aclanthology.org/2026.trustnlp-main.44/
%P 563-583
Markdown (Informal)
[A Systematic Comparison between Extractive Self-Explanations and Human Rationales in Text Classification](https://aclanthology.org/2026.trustnlp-main.44/) (Brandl & Eberle, TrustNLP 2026)
ACL