% ACL Anthology record for paper 2026.evaleval-1.5 (EvalEval workshop proceedings).
% Proper nouns, acronyms and math in the title are wrapped in braces as whole
% words so that sentence-casing bibliography styles keep their capitalisation.
% The abstract writes the Fibble variants with explicit math superscripts,
% matching the superscript used in the title.
@inproceedings{liu-2026-wordle,
  title     = {From {Wordle} to {Fibble$^5$}: Evaluating {LLM} Reasoning Under Escalating Deception},
  author    = {Liu, Chang},
  editor    = {Akhtar, Mubashara and
               Batzner, Jan and
               Choshen, Leshem and
               Ghosh, Avijit and
               Gohar, Usman and
               Mickel, Jennifer and
               Pant, Ichhya and
               Talat, Zeerak and
               Lin, Michelle},
  booktitle = {Proceedings of the Workshop on Evaluating Evaluations ({EvalEval})},
  month     = jul,
  year      = {2026},
  address   = {San Diego, CA},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.evaleval-1.5/},
  pages     = {36--45},
  isbn      = {979-8-89176-429-3},
  abstract  = {Standard benchmarks for large language models (LLMs) assume that task feedback is truthful, but real-world reasoning often requires processing unreliable or adversarial information. We introduce WordleArenas, a benchmark platform that evaluates LLM reasoning robustness across a deception gradient. Building on Wordle and its deceptive variant Fibble (Chusap et al., 2025), we generalize to Fibble$^k$ ($k = 0, \ldots, 5$ lies per row), creating a controlled evaluation of LLM robustness to misinformation. Across six arenas {---} standard Wordle (0 lies per row) through Fibble$^5$ (5 lies per row) {---} we evaluate 41 models from 10 providers across 3,749 games. We find that (1) even one lie per row causes catastrophic performance drops (average win rate falls from 41.1{\%} to 18.7{\%}), (2) a sharp deception cliff emerges at 2{--}3 lies where nearly all models collapse to {\ensuremath{\leq}}3{\%} win rate, and (3) model robustness to deception is poorly predicted by standard benchmark rankings. A surprising Fibble$^5$ recovery emerges: some models recover partial performance when all feedback lies (average 9.5{\%}), outperforming Fibble$^3$ (0.3{\%}) and Fibble$^4$ (0.4{\%}), because knowing that every tile lies restores deterministic {---} though partial {---} information. Our results demonstrate that truthful-feedback evaluations systematically overestimate LLM reasoning capabilities and that deception-aware benchmarks are essential for assessing real-world robustness. All code and data are publicly available.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!--
  MODS v3 record for citekey liu-2026-wordle.
  The top-level <mods> element describes the paper itself (title, author,
  issue date, abstract, URL, page extent); the <relatedItem type="host">
  element describes the proceedings volume it appears in (title, editors,
  publisher, place, ISBN).
  The abstract is plain Unicode text: LaTeX macros from the BibTeX export
  are written as their Unicode characters (e.g. "≤", superscript digits).
-->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="liu-2026-wordle">
    <titleInfo>
      <title>From Wordle to Fibble⁵: Evaluating LLM Reasoning Under Escalating Deception</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Chang</namePart>
      <namePart type="family">Liu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2026-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume and its editors. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the Workshop on Evaluating Evaluations (EvalEval)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Mubashara</namePart>
        <namePart type="family">Akhtar</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jan</namePart>
        <namePart type="family">Batzner</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Leshem</namePart>
        <namePart type="family">Choshen</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Avijit</namePart>
        <namePart type="family">Ghosh</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Usman</namePart>
        <namePart type="family">Gohar</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jennifer</namePart>
        <namePart type="family">Mickel</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Ichhya</namePart>
        <namePart type="family">Pant</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Zeerak</namePart>
        <namePart type="family">Talat</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Michelle</namePart>
        <namePart type="family">Lin</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">San Diego, CA</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-429-3</identifier>
    </relatedItem>
    <abstract>Standard benchmarks for large language models (LLMs) assume that task feedback is truthful, but real-world reasoning often requires processing unreliable or adversarial information. We introduce WordleArenas, a benchmark platform that evaluates LLM reasoning robustness across a deception gradient. Building on Wordle and its deceptive variant Fibble (Chusap et al., 2025), we generalize to Fibbleᵏ (k = 0, …, 5 lies per row), creating a controlled evaluation of LLM robustness to misinformation. Across six arenas — standard Wordle (0 lies per row) through Fibble⁵ (5 lies per row) — we evaluate 41 models from 10 providers across 3,749 games. We find that (1) even one lie per row causes catastrophic performance drops (average win rate falls from 41.1% to 18.7%), (2) a sharp deception cliff emerges at 2–3 lies where nearly all models collapse to ≤3% win rate, and (3) model robustness to deception is poorly predicted by standard benchmark rankings. A surprising Fibble⁵ recovery emerges: some models recover partial performance when all feedback lies (average 9.5%), outperforming Fibble³ (0.3%) and Fibble⁴ (0.4%), because knowing that every tile lies restores deterministic — though partial — information. Our results demonstrate that truthful-feedback evaluations systematically overestimate LLM reasoning capabilities and that deception-aware benchmarks are essential for assessing real-world robustness. All code and data are publicly available.</abstract>
    <identifier type="citekey">liu-2026-wordle</identifier>
    <location>
      <url>https://aclanthology.org/2026.evaleval-1.5/</url>
    </location>
    <part>
      <date>2026-07</date>
      <extent unit="page">
        <start>36</start>
        <end>45</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T From Wordle to Fibble⁵: Evaluating LLM Reasoning Under Escalating Deception
%A Liu, Chang
%Y Akhtar, Mubashara
%Y Batzner, Jan
%Y Choshen, Leshem
%Y Ghosh, Avijit
%Y Gohar, Usman
%Y Mickel, Jennifer
%Y Pant, Ichhya
%Y Talat, Zeerak
%Y Lin, Michelle
%S Proceedings of the Workshop on Evaluating Evaluations (EvalEval)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, CA
%@ 979-8-89176-429-3
%F liu-2026-wordle
%X Standard benchmarks for large language models (LLMs) assume that task feedback is truthful, but real-world reasoning often requires processing unreliable or adversarial information. We introduce WordleArenas, a benchmark platform that evaluates LLM reasoning robustness across a deception gradient. Building on Wordle and its deceptive variant Fibble (Chusap et al., 2025), we generalize to Fibbleᵏ (k = 0, …, 5 lies per row), creating a controlled evaluation of LLM robustness to misinformation. Across six arenas — standard Wordle (0 lies per row) through Fibble⁵ (5 lies per row) — we evaluate 41 models from 10 providers across 3,749 games. We find that (1) even one lie per row causes catastrophic performance drops (average win rate falls from 41.1% to 18.7%), (2) a sharp deception cliff emerges at 2–3 lies where nearly all models collapse to ≤3% win rate, and (3) model robustness to deception is poorly predicted by standard benchmark rankings. A surprising Fibble⁵ recovery emerges: some models recover partial performance when all feedback lies (average 9.5%), outperforming Fibble³ (0.3%) and Fibble⁴ (0.4%), because knowing that every tile lies restores deterministic — though partial — information. Our results demonstrate that truthful-feedback evaluations systematically overestimate LLM reasoning capabilities and that deception-aware benchmarks are essential for assessing real-world robustness. All code and data are publicly available.
%U https://aclanthology.org/2026.evaleval-1.5/
%P 36-45
Markdown (Informal)
[From Wordle to Fibble⁵: Evaluating LLM Reasoning Under Escalating Deception](https://aclanthology.org/2026.evaleval-1.5/) (Liu, EvalEval 2026)
ACL