% Conference-paper record for https://aclanthology.org/2026.findings-acl.1368/
% All string values are brace-delimited; month uses the predefined macro.
% The closing brace sits on its own line so the XML record that follows
% starts on a fresh line instead of being fused to the end of this entry.
@inproceedings{liu-etal-2026-trustworthy,
  author    = {Liu, Yang and
               Zhang, Yinghao and
               Liu, Lin and
               Li, Jiuyong and
               Cheng, Debo and
               Feng, Zaiwen},
  title     = {Trustworthy and Explainable Causal Representation Learning in Transformers},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Findings of the {Association} for {Computational} {Linguistics}: {ACL} 2026},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  pages     = {27482--27501},
  isbn      = {979-8-89176-395-1},
  url       = {https://aclanthology.org/2026.findings-acl.1368/},
  abstract  = {A prevalent approach to interpretable representation learning involves creating a mask that weights the significance of each input feature, followed by deriving a masked representation by applying this mask to the input representation. However, the identifiability of these learned masked representations is often uncertain, making the origin of these representations ambiguous or unreliable. Furthermore, the approaches to interpreting Transformer based on attention weights have been criticized for their faithfulness. To address these limitations, we propose a novel causal framework that directly learns identifiable and explainable representations from attention weights, rather than relying on importance masks. Our framework leverages identifiability theory and causal representation learning to extract explainable representations within a subspace of input representations, effectively transforming frozen representation learning methods into self-explaining systems. Experimental results on real-world datasets demonstrate that, compared to well-established state-of-the-art methods, our approach provides identifiable and more trustworthy explanations while guaranteeing faithfulness.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- Single MODS record; its ID matches the citekey identifier below. -->
  <mods ID="liu-etal-2026-trustworthy">
    <titleInfo>
      <title>Trustworthy and Explainable Causal Representation Learning in Transformers</title>
    </titleInfo>
    <!-- Authors, in listed order. -->
    <name type="personal">
      <namePart type="given">Yang</namePart>
      <namePart type="family">Liu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Yinghao</namePart>
      <namePart type="family">Zhang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Lin</namePart>
      <namePart type="family">Liu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Jiuyong</namePart>
      <namePart type="family">Li</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Debo</namePart>
      <namePart type="family">Cheng</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Zaiwen</namePart>
      <namePart type="family">Feng</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2026-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, its editors, publisher and ISBN. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: ACL 2026</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Maria</namePart>
        <namePart type="family">Liakata</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Viviane</namePart>
        <namePart type="given">P</namePart>
        <namePart type="family">Moreira</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jiajun</namePart>
        <namePart type="family">Zhang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">David</namePart>
        <namePart type="family">Jurgens</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">San Diego, California, United States</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-395-1</identifier>
    </relatedItem>
    <abstract>A prevalent approach to interpretable representation learning involves creating a mask that weights the significance of each input feature, followed by deriving a masked representation by applying this mask to the input representation. However, the identifiability of these learned masked representations is often uncertain, making the origin of these representations ambiguous or unreliable. Furthermore, the approaches to interpreting Transformer based on attention weights have been criticized for their faithfulness. To address these limitations, we propose a novel causal framework that directly learns identifiable and explainable representations from attention weights, rather than relying on importance masks. Our framework leverages identifiability theory and causal representation learning to extract explainable representations within a subspace of input representations, effectively transforming frozen representation learning methods into self-explaining systems. Experimental results on real-world datasets demonstrate that, compared to well-established state-of-the-art methods, our approach provides identifiable and more trustworthy explanations while guaranteeing faithfulness.</abstract>
    <identifier type="citekey">liu-etal-2026-trustworthy</identifier>
    <location>
      <url>https://aclanthology.org/2026.findings-acl.1368/</url>
    </location>
    <!-- Issue date and page extent of the paper. -->
    <part>
      <date>2026-07</date>
      <extent unit="page">
        <start>27482</start>
        <end>27501</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Trustworthy and Explainable Causal Representation Learning in Transformers
%A Liu, Yang
%A Zhang, Yinghao
%A Liu, Lin
%A Li, Jiuyong
%A Cheng, Debo
%A Feng, Zaiwen
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Findings of the Association for Computational Linguistics: ACL 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-395-1
%F liu-etal-2026-trustworthy
%X A prevalent approach to interpretable representation learning involves creating a mask that weights the significance of each input feature, followed by deriving a masked representation by applying this mask to the input representation. However, the identifiability of these learned masked representations is often uncertain, making the origin of these representations ambiguous or unreliable. Furthermore, the approaches to interpreting Transformer based on attention weights have been criticized for their faithfulness. To address these limitations, we propose a novel causal framework that directly learns identifiable and explainable representations from attention weights, rather than relying on importance masks. Our framework leverages identifiability theory and causal representation learning to extract explainable representations within a subspace of input representations, effectively transforming frozen representation learning methods into self-explaining systems. Experimental results on real-world datasets demonstrate that, compared to well-established state-of-the-art methods, our approach provides identifiable and more trustworthy explanations while guaranteeing faithfulness.
%U https://aclanthology.org/2026.findings-acl.1368/
%P 27482-27501
Markdown (Informal)
[Trustworthy and Explainable Causal Representation Learning in Transformers](https://aclanthology.org/2026.findings-acl.1368/) (Liu et al., Findings 2026)
ACL