% Conference paper record; string values are brace-delimited and month uses the standard three-letter macro.
@inproceedings{dhar-etal-2026-evaluating,
  title     = {Evaluating Adjective-Noun Compositionality in {LLM}s: Functional vs Representational Perspectives},
  author    = {Dhar, Ruchira and Peng, Qiwei and S{\o}gaard, Anders},
  editor    = {Mohammad, Saif M. and Ousidhoum, Nedjma},
  booktitle = {Proceedings of the 15th Joint Conference on Lexical and Computational Semantics (*{SEM} 2026)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.starsem-conference.8/},
  pages     = {125--135},
  isbn      = {979-8-89176-413-2},
  abstract  = {Compositionality is considered central to language abilities. As performant language systems, how do large language models (LLMs) do on compositional tasks? We evaluate adjective{--}noun compositionality in LLMs using two complementary setups: prompt-based functional assessment and a representational analysis of internal model states. Our results reveal a striking divergence between task performance and internal states. While LLMs reliably develop compositional representations, they fail to translate consistently into functional task success across model variants. Consequently, we highlight the importance of contrastive evaluation for obtaining a more complete understanding of model capabilities.}
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- Single MODS record; its ID matches the citekey identifier further down. -->
  <mods ID="dhar-etal-2026-evaluating">
    <titleInfo>
      <title>Evaluating Adjective-Noun Compositionality in LLMs: Functional vs Representational Perspectives</title>
    </titleInfo>
    <!-- Authors, in listed order. -->
    <name type="personal">
      <namePart type="given">Ruchira</namePart>
      <namePart type="family">Dhar</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Qiwei</namePart>
      <namePart type="family">Peng</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Anders</namePart>
      <namePart type="family">Søgaard</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2026-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, with its editors, publisher, place and ISBN. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 15th Joint Conference on Lexical and Computational Semantics (*SEM 2026)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Saif</namePart>
        <namePart type="given">M</namePart>
        <namePart type="family">Mohammad</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Nedjma</namePart>
        <namePart type="family">Ousidhoum</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">San Diego, California, United States</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-413-2</identifier>
    </relatedItem>
    <abstract>Compositionality is considered central to language abilities. As performant language systems, how do large language models (LLMs) do on compositional tasks? We evaluate adjective–noun compositionality in LLMs using two complementary setups: prompt-based functional assessment and a representational analysis of internal model states. Our results reveal a striking divergence between task performance and internal states. While LLMs reliably develop compositional representations, they fail to translate consistently into functional task success across model variants. Consequently, we highlight the importance of contrastive evaluation for obtaining a more complete understanding of model capabilities.</abstract>
    <identifier type="citekey">dhar-etal-2026-evaluating</identifier>
    <location>
      <url>https://aclanthology.org/2026.starsem-conference.8/</url>
    </location>
    <!-- Issue date and page range of the paper. -->
    <part>
      <date>2026-07</date>
      <extent unit="page">
        <start>125</start>
        <end>135</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Evaluating Adjective-Noun Compositionality in LLMs: Functional vs Representational Perspectives
%A Dhar, Ruchira
%A Peng, Qiwei
%A Søgaard, Anders
%Y Mohammad, Saif M.
%Y Ousidhoum, Nedjma
%S Proceedings of the 15th Joint Conference on Lexical and Computational Semantics (*SEM 2026)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-413-2
%F dhar-etal-2026-evaluating
%X Compositionality is considered central to language abilities. As performant language systems, how do large language models (LLMs) do on compositional tasks? We evaluate adjective–noun compositionality in LLMs using two complementary setups: prompt-based functional assessment and a representational analysis of internal model states. Our results reveal a striking divergence between task performance and internal states. While LLMs reliably develop compositional representations, they fail to translate consistently into functional task success across model variants. Consequently, we highlight the importance of contrastive evaluation for obtaining a more complete understanding of model capabilities.
%U https://aclanthology.org/2026.starsem-conference.8/
%P 125-135
Markdown (Informal)
[Evaluating Adjective-Noun Compositionality in LLMs: Functional vs Representational Perspectives](https://aclanthology.org/2026.starsem-conference.8/) (Dhar et al., \*SEM 2026)
ACL