% Citation record for the Findings of ACL 2026 paper published at
% aclanthology.org/2026.findings-acl.959.
% Braces inside title/booktitle protect whole words whose capitalisation must
% survive a style's sentence-casing (acronyms, proper nouns, and the first word
% of the subtitle, which follows a question mark rather than a colon).
% NOTE(review): address carries the conference location rather than a
% publisher city; kept as exported -- confirm against the target style.
@inproceedings{li-etal-2026-born,
  title     = {Born Pragmatic, Trained to Hallucinate? {Quantifying} the Origins of Contextual Bias in {LLMs} via the {PaCE} Benchmark},
  author    = {Li, Ziming and
               Tian, Yu and
               Lan, Tian and
               Li, Jiang and
               Duo, Zehua and
               Gao, Guanglai and
               Su, Xiangdong},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Findings of the {Association} for {Computational} {Linguistics}: {ACL} 2026},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.findings-acl.959/},
  pages     = {19211--19235},
  isbn      = {979-8-89176-395-1},
  abstract  = {While Large Language Models (LLMs) excel at capturing communicative intent, this capability introduces a side effect: Pragmatic Hallucination, where models over-interpret literal contexts to generate non-factual inferences. To quantify this, we introduce the PaCE (Pragmatics-as-Context Evaluation) benchmark, comprising over 3,000 manually verified ``context-flip'' samples. Evaluations across nine mainstream models reveal a significant Context Sensitivity Gap (CSG), with literal accuracy consistently lagging behind pragmatic reasoning. Attribution analysis indicates that Reinforcement Learning from Human Feedback (RLHF) exacerbates this bias, and neither parameter scaling nor Chain-of-Thought (CoT) fully mitigates it. Crucially, ``Strict Prompting'' effectively reverses the CSG, demonstrating that the phenomenon stems from behavioral lock-in during training rather than inherent capability deficiencies. Furthermore, error patterns exhibit high systematic correlation across diverse architectures. This study highlights that current alignment paradigms lack precise control over pragmatic boundaries, underscoring the necessity for a ``Literal Grounding'' mechanism in future safety frameworks.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- Single MODS v3 record; its ID matches the citekey identifier further down. -->
  <mods ID="li-etal-2026-born">
    <titleInfo>
      <title>Born Pragmatic, Trained to Hallucinate? Quantifying the Origins of Contextual Bias in LLMs via the PaCE Benchmark</title>
    </titleInfo>
    <!-- Authors, in listed order. -->
    <name type="personal">
      <namePart type="given">Ziming</namePart>
      <namePart type="family">Li</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Yu</namePart>
      <namePart type="family">Tian</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Tian</namePart>
      <namePart type="family">Lan</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Jiang</namePart>
      <namePart type="family">Li</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Zehua</namePart>
      <namePart type="family">Duo</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Guanglai</namePart>
      <namePart type="family">Gao</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Xiangdong</namePart>
      <namePart type="family">Su</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2026-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, with its editors, publisher, place and ISBN. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: ACL 2026</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Maria</namePart>
        <namePart type="family">Liakata</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Viviane</namePart>
        <namePart type="given">P</namePart>
        <namePart type="family">Moreira</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jiajun</namePart>
        <namePart type="family">Zhang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">David</namePart>
        <namePart type="family">Jurgens</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">San Diego, California, United States</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-395-1</identifier>
    </relatedItem>
    <abstract>While Large Language Models (LLMs) excel at capturing communicative intent, this capability introduces a side effect: Pragmatic Hallucination, where models over-interpret literal contexts to generate non-factual inferences. To quantify this, we introduce the PaCE (Pragmatics-as-Context Evaluation) benchmark, comprising over 3,000 manually verified “context-flip” samples. Evaluations across nine mainstream models reveal a significant Context Sensitivity Gap (CSG), with literal accuracy consistently lagging behind pragmatic reasoning. Attribution analysis indicates that Reinforcement Learning from Human Feedback (RLHF) exacerbates this bias, and neither parameter scaling nor Chain-of-Thought (CoT) fully mitigates it. Crucially, “Strict Prompting” effectively reverses the CSG, demonstrating that the phenomenon stems from behavioral lock-in during training rather than inherent capability deficiencies. Furthermore, error patterns exhibit high systematic correlation across diverse architectures. This study highlights that current alignment paradigms lack precise control over pragmatic boundaries, underscoring the necessity for a “Literal Grounding” mechanism in future safety frameworks.</abstract>
    <identifier type="citekey">li-etal-2026-born</identifier>
    <location>
      <url>https://aclanthology.org/2026.findings-acl.959/</url>
    </location>
    <!-- Issue date and page extent of the paper. -->
    <part>
      <date>2026-07</date>
      <extent unit="page">
        <start>19211</start>
        <end>19235</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Born Pragmatic, Trained to Hallucinate? Quantifying the Origins of Contextual Bias in LLMs via the PaCE Benchmark
%A Li, Ziming
%A Tian, Yu
%A Lan, Tian
%A Li, Jiang
%A Duo, Zehua
%A Gao, Guanglai
%A Su, Xiangdong
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Findings of the Association for Computational Linguistics: ACL 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-395-1
%F li-etal-2026-born
%X While Large Language Models (LLMs) excel at capturing communicative intent, this capability introduces a side effect: Pragmatic Hallucination, where models over-interpret literal contexts to generate non-factual inferences. To quantify this, we introduce the PaCE (Pragmatics-as-Context Evaluation) benchmark, comprising over 3,000 manually verified “context-flip” samples. Evaluations across nine mainstream models reveal a significant Context Sensitivity Gap (CSG), with literal accuracy consistently lagging behind pragmatic reasoning. Attribution analysis indicates that Reinforcement Learning from Human Feedback (RLHF) exacerbates this bias, and neither parameter scaling nor Chain-of-Thought (CoT) fully mitigates it. Crucially, “Strict Prompting” effectively reverses the CSG, demonstrating that the phenomenon stems from behavioral lock-in during training rather than inherent capability deficiencies. Furthermore, error patterns exhibit high systematic correlation across diverse architectures. This study highlights that current alignment paradigms lack precise control over pragmatic boundaries, underscoring the necessity for a “Literal Grounding” mechanism in future safety frameworks.
%U https://aclanthology.org/2026.findings-acl.959/
%P 19211-19235
Markdown (Informal)
[Born Pragmatic, Trained to Hallucinate? Quantifying the Origins of Contextual Bias in LLMs via the PaCE Benchmark](https://aclanthology.org/2026.findings-acl.959/) (Li et al., Findings 2026)
ACL
- Ziming Li, Yu Tian, Tian Lan, Jiang Li, Zehua Duo, Guanglai Gao, and Xiangdong Su. 2026. Born Pragmatic, Trained to Hallucinate? Quantifying the Origins of Contextual Bias in LLMs via the PaCE Benchmark. In Findings of the Association for Computational Linguistics: ACL 2026, pages 19211–19235, San Diego, California, United States. Association for Computational Linguistics.