@inproceedings{shi-etal-2025-llms,
title = "Do {LLM}s Behave as Claimed? Investigating How {LLM}s Follow Their Own Claims using Counterfactual Questions",
author = "Shi, Haochen and
Li, Shaobo and
Chao, Guoqing and
Shi, Xiaoliang and
Chen, Wentao and
Ji, Zhenzhou",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.emnlp-main.1479/",
pages = "29031--29044",
ISBN = "979-8-89176-332-6",
abstract = "Large Language Models (LLMs) require robust evaluation. However, existing frameworks often rely on curated datasets that, once public, may be accessed by newer LLMs. This creates a risk of data leakage, where test sets inadvertently become part of training data, compromising evaluation fairness and integrity. To mitigate this issue, we propose Behave as Claimed (BaC), a novel evaluation framework inspired by counterfactual reasoning. BaC constructs a ``what-if'' scenario where LLMs respond to counterfactual questions about how they would behave if the input were manipulated. We refer to these responses as claims, which are verifiable by observing the LLMs' actual behavior when given the manipulated input. BaC dynamically generates and verifies counterfactual questions using various few-shot in-context learning evaluation datasets, reducing their susceptibility to data leakage. Moreover, BaC provides a more challenging evaluation paradigm for LLMs. LLMs must thoroughly understand the prompt, the task, and the consequences of their responses to achieve better performance. We evaluate several state-of-the-art LLMs and find that, while most perform well on the original datasets, they struggle with BaC. This suggests that LLMs usually fail to align their claims with their actual behavior and that high performance on standard datasets may be less stable than previously assumed."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="shi-etal-2025-llms">
<titleInfo>
<title>Do LLMs Behave as Claimed? Investigating How LLMs Follow Their Own Claims using Counterfactual Questions</title>
</titleInfo>
<name type="personal">
<namePart type="given">Haochen</namePart>
<namePart type="family">Shi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shaobo</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Guoqing</namePart>
<namePart type="family">Chao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xiaoliang</namePart>
<namePart type="family">Shi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wentao</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhenzhou</namePart>
<namePart type="family">Ji</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Christos</namePart>
<namePart type="family">Christodoulopoulos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tanmoy</namePart>
<namePart type="family">Chakraborty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Carolyn</namePart>
<namePart type="family">Rose</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Violet</namePart>
<namePart type="family">Peng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Suzhou, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-332-6</identifier>
</relatedItem>
<abstract>Large Language Models (LLMs) require robust evaluation. However, existing frameworks often rely on curated datasets that, once public, may be accessed by newer LLMs. This creates a risk of data leakage, where test sets inadvertently become part of training data, compromising evaluation fairness and integrity. To mitigate this issue, we propose Behave as Claimed (BaC), a novel evaluation framework inspired by counterfactual reasoning. BaC constructs a “what-if” scenario where LLMs respond to counterfactual questions about how they would behave if the input were manipulated. We refer to these responses as claims, which are verifiable by observing the LLMs’ actual behavior when given the manipulated input. BaC dynamically generates and verifies counterfactual questions using various few-shot in-context learning evaluation datasets, reducing their susceptibility to data leakage. Moreover, BaC provides a more challenging evaluation paradigm for LLMs. LLMs must thoroughly understand the prompt, the task, and the consequences of their responses to achieve better performance. We evaluate several state-of-the-art LLMs and find that, while most perform well on the original datasets, they struggle with BaC. This suggests that LLMs usually fail to align their claims with their actual behavior and that high performance on standard datasets may be less stable than previously assumed.</abstract>
<identifier type="citekey">shi-etal-2025-llms</identifier>
<location>
<url>https://aclanthology.org/2025.emnlp-main.1479/</url>
</location>
<part>
<date>2025-11</date>
<extent unit="page">
<start>29031</start>
<end>29044</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Do LLMs Behave as Claimed? Investigating How LLMs Follow Their Own Claims using Counterfactual Questions
%A Shi, Haochen
%A Li, Shaobo
%A Chao, Guoqing
%A Shi, Xiaoliang
%A Chen, Wentao
%A Ji, Zhenzhou
%Y Christodoulopoulos, Christos
%Y Chakraborty, Tanmoy
%Y Rose, Carolyn
%Y Peng, Violet
%S Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-332-6
%F shi-etal-2025-llms
%X Large Language Models (LLMs) require robust evaluation. However, existing frameworks often rely on curated datasets that, once public, may be accessed by newer LLMs. This creates a risk of data leakage, where test sets inadvertently become part of training data, compromising evaluation fairness and integrity. To mitigate this issue, we propose Behave as Claimed (BaC), a novel evaluation framework inspired by counterfactual reasoning. BaC constructs a “what-if” scenario where LLMs respond to counterfactual questions about how they would behave if the input were manipulated. We refer to these responses as claims, which are verifiable by observing the LLMs’ actual behavior when given the manipulated input. BaC dynamically generates and verifies counterfactual questions using various few-shot in-context learning evaluation datasets, reducing their susceptibility to data leakage. Moreover, BaC provides a more challenging evaluation paradigm for LLMs. LLMs must thoroughly understand the prompt, the task, and the consequences of their responses to achieve better performance. We evaluate several state-of-the-art LLMs and find that, while most perform well on the original datasets, they struggle with BaC. This suggests that LLMs usually fail to align their claims with their actual behavior and that high performance on standard datasets may be less stable than previously assumed.
%U https://aclanthology.org/2025.emnlp-main.1479/
%P 29031-29044
Markdown (Informal)
[Do LLMs Behave as Claimed? Investigating How LLMs Follow Their Own Claims using Counterfactual Questions](https://aclanthology.org/2025.emnlp-main.1479/) (Shi et al., EMNLP 2025)
ACL