@inproceedings{feng-etal-2026-tracking,
    title     = "Tracking the Limits of Knowledge Propagation: How {LLM}s Fail at Multi-Step Reasoning with Conflicting Knowledge",
    author    = "Feng, Yiyang  and
      Chen, Zeming  and
      Wu, Haotian  and
      Zhou, Jiawei  and
      Bosselut, Antoine",
    editor    = "Demberg, Vera  and
      Inui, Kentaro  and
      Marquez, Llu{\'\i}s",
    booktitle = "Proceedings of the 19th Conference of the {E}uropean Chapter of the {A}ssociation for {C}omputational {L}inguistics (Volume 1: Long Papers)",
    month     = mar,
    year      = "2026",
    address   = "Rabat, Morocco",
    publisher = "Association for Computational Linguistics",
    url       = "https://aclanthology.org/2026.eacl-long.273/",
    pages     = "5813--5847",
    isbn      = "979-8-89176-380-7",
    abstract  = "A common solution for mitigating outdated or incorrect information in Large Language Models (LLMs) is to provide updated facts in-context or through knowledge editing. However, these methods introduce knowledge conflicts when the knowledge update fails to overwrite the model{'}s parametric knowledge, which propagate to faulty reasoning. Current benchmarks for this problem, however, largely focus only on single knowledge updates and fact recall without evaluating how these updates affect downstream reasoning. In this work, we introduce \textsc{Track} (\emph{Testing Reasoning Amid Conflicting Knowledge}), a new benchmark for studying how LLMs propagate new knowledge through multi-step reasoning when it conflicts with the model{'}s initial parametric knowledge. Spanning three reasoning-intensive scenarios (WIKI, CODE, and MATH), \textsc{Track} introduces multiple, realistic conflicts to mirror real-world complexity. Our results on \textsc{Track} reveal that providing updated facts to models for reasoning can worsen performance compared to providing no updated facts to a model, and that this performance degradation exacerbates as more updated facts are provided. We show this failure stems from both inability to faithfully integrate updated facts, but also flawed reasoning even when knowledge is integrated. \textsc{Track} provides a rigorous new benchmark to measure and guide future progress on propagating conflicting knowledge in multi-step reasoning."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="feng-etal-2026-tracking">
<titleInfo>
<title>Tracking the Limits of Knowledge Propagation: How LLMs Fail at Multi-Step Reasoning with Conflicting Knowledge</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yiyang</namePart>
<namePart type="family">Feng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zeming</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Haotian</namePart>
<namePart type="family">Wu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiawei</namePart>
<namePart type="family">Zhou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Antoine</namePart>
<namePart type="family">Bosselut</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 19th Conference of the European Chapter of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Vera</namePart>
<namePart type="family">Demberg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kentaro</namePart>
<namePart type="family">Inui</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lluís</namePart>
<namePart type="family">Marquez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Rabat, Morocco</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-380-7</identifier>
</relatedItem>
<abstract>A common solution for mitigating outdated or incorrect information in Large Language Models (LLMs) is to provide updated facts in-context or through knowledge editing. However, these methods introduce knowledge conflicts when the knowledge update fails to overwrite the model’s parametric knowledge, which propagate to faulty reasoning. Current benchmarks for this problem, however, largely focus only on single knowledge updates and fact recall without evaluating how these updates affect downstream reasoning. In this work, we introduce Tʀᴀᴄᴋ (*Testing Reasoning Amid Conflicting Knowledge*), a new benchmark for studying how LLMs propagate new knowledge through multi-step reasoning when it conflicts with the model’s initial parametric knowledge. Spanning three reasoning-intensive scenarios (WIKI, CODE, and MATH), Tʀᴀᴄᴋ introduces multiple, realistic conflicts to mirror real-world complexity. Our results on Tʀᴀᴄᴋ reveal that providing updated facts to models for reasoning can worsen performance compared to providing no updated facts to a model, and that this performance degradation exacerbates as more updated facts are provided. We show this failure stems from both inability to faithfully integrate updated facts, but also flawed reasoning even when knowledge is integrated. Tʀᴀᴄᴋ provides a rigorous new benchmark to measure and guide future progress on propagating conflicting knowledge in multi-step reasoning.</abstract>
<identifier type="citekey">feng-etal-2026-tracking</identifier>
<location>
<url>https://aclanthology.org/2026.eacl-long.273/</url>
</location>
<part>
<date>2026-03</date>
<extent unit="page">
<start>5813</start>
<end>5847</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Tracking the Limits of Knowledge Propagation: How LLMs Fail at Multi-Step Reasoning with Conflicting Knowledge
%A Feng, Yiyang
%A Chen, Zeming
%A Wu, Haotian
%A Zhou, Jiawei
%A Bosselut, Antoine
%Y Demberg, Vera
%Y Inui, Kentaro
%Y Marquez, Lluís
%S Proceedings of the 19th Conference of the European Chapter of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2026
%8 March
%I Association for Computational Linguistics
%C Rabat, Morocco
%@ 979-8-89176-380-7
%F feng-etal-2026-tracking
%X A common solution for mitigating outdated or incorrect information in Large Language Models (LLMs) is to provide updated facts in-context or through knowledge editing. However, these methods introduce knowledge conflicts when the knowledge update fails to overwrite the model’s parametric knowledge, which propagate to faulty reasoning. Current benchmarks for this problem, however, largely focus only on single knowledge updates and fact recall without evaluating how these updates affect downstream reasoning. In this work, we introduce Tʀᴀᴄᴋ (*Testing Reasoning Amid Conflicting Knowledge*), a new benchmark for studying how LLMs propagate new knowledge through multi-step reasoning when it conflicts with the model’s initial parametric knowledge. Spanning three reasoning-intensive scenarios (WIKI, CODE, and MATH), Tʀᴀᴄᴋ introduces multiple, realistic conflicts to mirror real-world complexity. Our results on Tʀᴀᴄᴋ reveal that providing updated facts to models for reasoning can worsen performance compared to providing no updated facts to a model, and that this performance degradation exacerbates as more updated facts are provided. We show this failure stems from both inability to faithfully integrate updated facts, but also flawed reasoning even when knowledge is integrated. Tʀᴀᴄᴋ provides a rigorous new benchmark to measure and guide future progress on propagating conflicting knowledge in multi-step reasoning.
%U https://aclanthology.org/2026.eacl-long.273/
%P 5813-5847
Markdown (Informal)
[Tracking the Limits of Knowledge Propagation: How LLMs Fail at Multi-Step Reasoning with Conflicting Knowledge](https://aclanthology.org/2026.eacl-long.273/) (Feng et al., EACL 2026)
ACL