@inproceedings{bailleux-etal-2025-connecting,
title = "Connecting Concept Layers and Rationales to Enhance Language Model Interpretability",
author = "Bailleux, Thomas and
Mukherjee, Tanmoy and
Marquis, Pierre and
Bouraoui, Zied",
editor = "Frermann, Lea and
Stevenson, Mark",
booktitle = "Proceedings of the 14th Joint Conference on Lexical and Computational Semantics (*SEM 2025)",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.starsem-1.33/",
pages = "409--429",
ISBN = "979-8-89176-340-1",
abstract = "With the introduction of large language models, NLP has undergone a paradigm shift where these models now serve as the backbone of most developed systems. However, while highly effective, they remain opaque and difficult to interpret, which limits their adoption in critical applications that require transparency and trust. Two major approaches aim to address this: rationale extraction, which highlights input spans that justify predictions, and concept bottleneck models, which make decisions through human-interpretable concepts. Yet each has limitations. Crucially, current models lack a unified framework that connects where a model looks (rationales) with why it makes a decision (concepts). We introduce CLARITY, a model that first selects key input spans, maps them to interpretable concepts, and then predicts using only those concepts. This design supports faithful, multi-level explanations and allows users to intervene at both the rationale and concept levels. CLARITY, achieves competitive accuracy while offering improved transparency and controllability."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="bailleux-etal-2025-connecting">
<titleInfo>
<title>Connecting Concept Layers and Rationales to Enhance Language Model Interpretability</title>
</titleInfo>
<name type="personal">
<namePart type="given">Thomas</namePart>
<namePart type="family">Bailleux</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tanmoy</namePart>
<namePart type="family">Mukherjee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pierre</namePart>
<namePart type="family">Marquis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zied</namePart>
<namePart type="family">Bouraoui</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 14th Joint Conference on Lexical and Computational Semantics (*SEM 2025)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Lea</namePart>
<namePart type="family">Frermann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mark</namePart>
<namePart type="family">Stevenson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Suzhou, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-340-1</identifier>
</relatedItem>
<abstract>With the introduction of large language models, NLP has undergone a paradigm shift where these models now serve as the backbone of most developed systems. However, while highly effective, they remain opaque and difficult to interpret, which limits their adoption in critical applications that require transparency and trust. Two major approaches aim to address this: rationale extraction, which highlights input spans that justify predictions, and concept bottleneck models, which make decisions through human-interpretable concepts. Yet each has limitations. Crucially, current models lack a unified framework that connects where a model looks (rationales) with why it makes a decision (concepts). We introduce CLARITY, a model that first selects key input spans, maps them to interpretable concepts, and then predicts using only those concepts. This design supports faithful, multi-level explanations and allows users to intervene at both the rationale and concept levels. CLARITY achieves competitive accuracy while offering improved transparency and controllability.</abstract>
<identifier type="citekey">bailleux-etal-2025-connecting</identifier>
<location>
<url>https://aclanthology.org/2025.starsem-1.33/</url>
</location>
<part>
<date>2025-11</date>
<extent unit="page">
<start>409</start>
<end>429</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Connecting Concept Layers and Rationales to Enhance Language Model Interpretability
%A Bailleux, Thomas
%A Mukherjee, Tanmoy
%A Marquis, Pierre
%A Bouraoui, Zied
%Y Frermann, Lea
%Y Stevenson, Mark
%S Proceedings of the 14th Joint Conference on Lexical and Computational Semantics (*SEM 2025)
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-340-1
%F bailleux-etal-2025-connecting
%X With the introduction of large language models, NLP has undergone a paradigm shift where these models now serve as the backbone of most developed systems. However, while highly effective, they remain opaque and difficult to interpret, which limits their adoption in critical applications that require transparency and trust. Two major approaches aim to address this: rationale extraction, which highlights input spans that justify predictions, and concept bottleneck models, which make decisions through human-interpretable concepts. Yet each has limitations. Crucially, current models lack a unified framework that connects where a model looks (rationales) with why it makes a decision (concepts). We introduce CLARITY, a model that first selects key input spans, maps them to interpretable concepts, and then predicts using only those concepts. This design supports faithful, multi-level explanations and allows users to intervene at both the rationale and concept levels. CLARITY achieves competitive accuracy while offering improved transparency and controllability.
%U https://aclanthology.org/2025.starsem-1.33/
%P 409-429
Markdown (Informal)
[Connecting Concept Layers and Rationales to Enhance Language Model Interpretability](https://aclanthology.org/2025.starsem-1.33/) (Bailleux et al., *SEM 2025)