@inproceedings{sileo-2026-logic,
title = "Logic Haystacks: Probing {LLM}s' Long-Context Logical Reasoning (Without Easily Identifiable Unrelated Padding)",
author = "Sileo, Damien",
editor = "Demberg, Vera and
Inui, Kentaro and
Marquez, Llu{\'i}s",
booktitle = "Proceedings of the 19th Conference of the {E}uropean Chapter of the {A}ssociation for {C}omputational {L}inguistics (Volume 2: Short Papers)",
month = mar,
year = "2026",
address = "Rabat, Morocco",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.eacl-short.3/",
pages = "66--75",
ISBN = "979-8-89176-381-4",
abstract = "Large language models demonstrate promising long-context processing capabilities, with recent models touting context windows close to one million tokens. However, the evaluations supporting these claims often involve simple retrieval tasks or synthetic tasks padded with irrelevant text, which models may easily detect and discard. In this work, we generate lengthy, simplified English text with first-order logic representations spanning up to 2048 sentences ({\textasciitilde}25$k GPT-4 tokens). We formulate an evaluation task with evidence retrieval for contradiction detection. The long, homogeneous text is filled with distractors that are both hard to distinguish from relevant evidence and provably non-interfering. Our evaluation of evidence retrieval reveals that the effective context window is much smaller with such realistic distractors, already crumbling at 128 sentences.$"
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="sileo-2026-logic">
<titleInfo>
<title>Logic Haystacks: Probing LLMs’ Long-Context Logical Reasoning (Without Easily Identifiable Unrelated Padding)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Damien</namePart>
<namePart type="family">Sileo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 19th Conference of the European Chapter of the Association for Computational Linguistics (Volume 2: Short Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Vera</namePart>
<namePart type="family">Demberg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kentaro</namePart>
<namePart type="family">Inui</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lluís</namePart>
<namePart type="family">Marquez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Rabat, Morocco</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-381-4</identifier>
</relatedItem>
<abstract>Large language models demonstrate promising long-context processing capabilities, with recent models touting context windows close to one million tokens. However, the evaluations supporting these claims often involve simple retrieval tasks or synthetic tasks padded with irrelevant text, which models may easily detect and discard. In this work, we generate lengthy, simplified English text with first-order logic representations spanning up to 2048 sentences (~25k GPT-4 tokens). We formulate an evaluation task with evidence retrieval for contradiction detection. The long, homogeneous text is filled with distractors that are both hard to distinguish from relevant evidence and provably non-interfering. Our evaluation of evidence retrieval reveals that the effective context window is much smaller with such realistic distractors, already crumbling at 128 sentences.</abstract>
<identifier type="citekey">sileo-2026-logic</identifier>
<location>
<url>https://aclanthology.org/2026.eacl-short.3/</url>
</location>
<part>
<date>2026-03</date>
<extent unit="page">
<start>66</start>
<end>75</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Logic Haystacks: Probing LLMs’ Long-Context Logical Reasoning (Without Easily Identifiable Unrelated Padding)
%A Sileo, Damien
%Y Demberg, Vera
%Y Inui, Kentaro
%Y Marquez, Lluís
%S Proceedings of the 19th Conference of the European Chapter of the Association for Computational Linguistics (Volume 2: Short Papers)
%D 2026
%8 March
%I Association for Computational Linguistics
%C Rabat, Morocco
%@ 979-8-89176-381-4
%F sileo-2026-logic
%X Large language models demonstrate promising long-context processing capabilities, with recent models touting context windows close to one million tokens. However, the evaluations supporting these claims often involve simple retrieval tasks or synthetic tasks padded with irrelevant text, which models may easily detect and discard. In this work, we generate lengthy, simplified English text with first-order logic representations spanning up to 2048 sentences (~25k GPT-4 tokens). We formulate an evaluation task with evidence retrieval for contradiction detection. The long, homogeneous text is filled with distractors that are both hard to distinguish from relevant evidence and provably non-interfering. Our evaluation of evidence retrieval reveals that the effective context window is much smaller with such realistic distractors, already crumbling at 128 sentences.
%U https://aclanthology.org/2026.eacl-short.3/
%P 66-75
Markdown (Informal)
[Logic Haystacks: Probing LLMs’ Long-Context Logical Reasoning (Without Easily Identifiable Unrelated Padding)](https://aclanthology.org/2026.eacl-short.3/) (Sileo, EACL 2026)
ACL