@inproceedings{kukreja-etal-2026-better,
title = "Better and Worse with Scale: How Contextual Entrainment Diverges with Model Size",
author = "Kukreja, Dikshant and
Sah, Kshitij and
Gupta, Gautam and
Anand, Avinash and
Shah, Rajiv Ratn and
Wang, Zhengkui and
Ng, Aik Beng and
Cambria, Erik",
editor = "Liakata, Maria and
Moreira, Viviane P. and
Zhang, Jiajun and
Jurgens, David",
booktitle = "Findings of the {A}ssociation for {C}omputational {L}inguistics: {ACL} 2026",
month = jul,
year = "2026",
address = "San Diego, California, United States",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.findings-acl.1509/",
pages = "30194--30209",
ISBN = "979-8-89176-395-1",
abstract = "Larger language models become simultaneously better and worse at handling contextual information{---}better at ignoring false claims, worse at ignoring irrelevant tokens. We formalize this apparent paradox through the first scaling laws for contextual entrainment, the tendency of models to favor tokens that appeared in context regardless of relevance. Analyzing the Cerebras-GPT (111M{--}13B) and Pythia (14M{--}12B) model families, we find entrainment follows predictable power-law scaling, but with opposite trends depending on context type: semantic contexts show decreasing entrainment with scale, while non-semantic contexts show increasing entrainment. Concretely, the largest models are four times more resistant to counterfactual misinformation than the smallest, yet simultaneously twice as prone to copying arbitrary tokens. These diverging trends, which replicate across model families, suggest that semantic filtering and mechanical copying are functionally distinct behaviors that scale in opposition. These opposing trends suggest that scaling alone does not resolve context sensitivity{---}it reshapes it."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="kukreja-etal-2026-better">
<titleInfo>
<title>Better and Worse with Scale: How Contextual Entrainment Diverges with Model Size</title>
</titleInfo>
<name type="personal">
<namePart type="given">Dikshant</namePart>
<namePart type="family">Kukreja</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kshitij</namePart>
<namePart type="family">Sah</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gautam</namePart>
<namePart type="family">Gupta</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Avinash</namePart>
<namePart type="family">Anand</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rajiv</namePart>
<namePart type="given">Ratn</namePart>
<namePart type="family">Shah</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhengkui</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aik</namePart>
<namePart type="given">Beng</namePart>
<namePart type="family">Ng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Erik</namePart>
<namePart type="family">Cambria</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2026</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Liakata</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Viviane</namePart>
<namePart type="given">P.</namePart>
<namePart type="family">Moreira</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiajun</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Jurgens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-395-1</identifier>
</relatedItem>
<abstract>Larger language models become simultaneously better and worse at handling contextual information—better at ignoring false claims, worse at ignoring irrelevant tokens. We formalize this apparent paradox through the first scaling laws for contextual entrainment, the tendency of models to favor tokens that appeared in context regardless of relevance. Analyzing the Cerebras-GPT (111M–13B) and Pythia (14M–12B) model families, we find entrainment follows predictable power-law scaling, but with opposite trends depending on context type: semantic contexts show decreasing entrainment with scale, while non-semantic contexts show increasing entrainment. Concretely, the largest models are four times more resistant to counterfactual misinformation than the smallest, yet simultaneously twice as prone to copying arbitrary tokens. These diverging trends, which replicate across model families, suggest that semantic filtering and mechanical copying are functionally distinct behaviors that scale in opposition. These opposing trends suggest that scaling alone does not resolve context sensitivity—it reshapes it.</abstract>
<identifier type="citekey">kukreja-etal-2026-better</identifier>
<location>
<url>https://aclanthology.org/2026.findings-acl.1509/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>30194</start>
<end>30209</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Better and Worse with Scale: How Contextual Entrainment Diverges with Model Size
%A Kukreja, Dikshant
%A Sah, Kshitij
%A Gupta, Gautam
%A Anand, Avinash
%A Shah, Rajiv Ratn
%A Wang, Zhengkui
%A Ng, Aik Beng
%A Cambria, Erik
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Findings of the Association for Computational Linguistics: ACL 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-395-1
%F kukreja-etal-2026-better
%X Larger language models become simultaneously better and worse at handling contextual information—better at ignoring false claims, worse at ignoring irrelevant tokens. We formalize this apparent paradox through the first scaling laws for contextual entrainment, the tendency of models to favor tokens that appeared in context regardless of relevance. Analyzing the Cerebras-GPT (111M–13B) and Pythia (14M–12B) model families, we find entrainment follows predictable power-law scaling, but with opposite trends depending on context type: semantic contexts show decreasing entrainment with scale, while non-semantic contexts show increasing entrainment. Concretely, the largest models are four times more resistant to counterfactual misinformation than the smallest, yet simultaneously twice as prone to copying arbitrary tokens. These diverging trends, which replicate across model families, suggest that semantic filtering and mechanical copying are functionally distinct behaviors that scale in opposition. These opposing trends suggest that scaling alone does not resolve context sensitivity—it reshapes it.
%U https://aclanthology.org/2026.findings-acl.1509/
%P 30194-30209
Markdown (Informal)
[Better and Worse with Scale: How Contextual Entrainment Diverges with Model Size](https://aclanthology.org/2026.findings-acl.1509/) (Kukreja et al., Findings 2026)
ACL
- Dikshant Kukreja, Kshitij Sah, Gautam Gupta, Avinash Anand, Rajiv Ratn Shah, Zhengkui Wang, Aik Beng Ng, and Erik Cambria. 2026. Better and Worse with Scale: How Contextual Entrainment Diverges with Model Size. In Findings of the Association for Computational Linguistics: ACL 2026, pages 30194–30209, San Diego, California, United States. Association for Computational Linguistics.