@comment{
  Conference-paper record for "Reasoning Hijacking" (ACL 2026, Volume 1: Long Papers).
  The abstract uses LaTeX markup (\textbf, \url) rather than Markdown so that it
  typesets cleanly if a style prints it; \url needs the url or hyperref package.
}
@inproceedings{liu-etal-2026-reasoning,
  title     = {Reasoning Hijacking: The Fragility of Reasoning Alignment in Large Language Models},
  author    = {Liu, Yuansen and
               Tang, Yixuan and
               Tung, Anthony Kum Hoe},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Proceedings of the 64th Annual Meeting of the {Association} for {Computational} {Linguistics} (Volume 1: Long Papers)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.acl-long.1698/},
  doi       = {10.18653/v1/2026.acl-long.1698},
  pages     = {36646--36665},
  isbn      = {979-8-89176-390-6},
  abstract  = {Current LLM safety research predominantly focuses on mitigating \textbf{Goal Hijacking}, preventing attackers from redirecting a model{'}s high-level objective (e.g., from ``summarizing emails'' to ``phishing users''). In this paper, we argue that this perspective is incomplete and highlight a critical vulnerability in \textbf{Reasoning Alignment}. We expose the inherent fragility of current alignment techniques by proposing a new adversarial prompt attack paradigm: \textbf{Reasoning Hijacking}. To demonstrate this vulnerability, we instantiate it via the \textbf{Criteria Attack}, which subverts model judgments by injecting spurious decision criteria without altering the high-level task goal. Unlike Goal Hijacking, which attempts to override the system prompt, Reasoning Hijacking keeps the task goal intact but manipulates the model{'}s decision-making logic by injecting spurious reasoning shortcuts. Through extensive experiments on three different tasks (toxic comment, negative review, and spam detection), we demonstrate that even state-of-the-art models are highly fragile, consistently prioritizing injected heuristic shortcuts over rigorous semantic analysis. Crucially, because the model{'}s explicit intent remains aligned with the user{'}s instructions, these attacks can bypass defenses designed to detect goal deviation (e.g., SecAlign, StruQ), revealing a fundamental blind spot in the current safety landscape. Data and code are available at \url{https://github.com/Yuan-Hou/criteria_attack}.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- MODS record for a single conference paper; ID matches the citekey identifier below. -->
  <mods ID="liu-etal-2026-reasoning">
    <titleInfo>
      <title>Reasoning Hijacking: The Fragility of Reasoning Alignment in Large Language Models</title>
    </titleInfo>

    <!-- Authors, in publication order. -->
    <name type="personal">
      <namePart type="given">Yuansen</namePart>
      <namePart type="family">Liu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Yixuan</namePart>
      <namePart type="family">Tang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Anthony</namePart>
      <namePart type="given">Kum</namePart>
      <namePart type="given">Hoe</namePart>
      <namePart type="family">Tung</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>

    <originInfo>
      <dateIssued>2026-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>

    <!-- Host item: the proceedings volume, with its editors, publisher, place and ISBN. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Maria</namePart>
        <namePart type="family">Liakata</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Viviane</namePart>
        <namePart type="given">P</namePart>
        <namePart type="family">Moreira</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jiajun</namePart>
        <namePart type="family">Zhang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">David</namePart>
        <namePart type="family">Jurgens</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">San Diego, California, United States</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-390-6</identifier>
    </relatedItem>

    <!-- Abstract is plain text: no Markdown emphasis or link syntax. -->
    <abstract>Current LLM safety research predominantly focuses on mitigating Goal Hijacking, preventing attackers from redirecting a model’s high-level objective (e.g., from “summarizing emails” to “phishing users”). In this paper, we argue that this perspective is incomplete and highlight a critical vulnerability in Reasoning Alignment. We expose the inherent fragility of current alignment techniques by proposing a new adversarial prompt attack paradigm: Reasoning Hijacking. To demonstrate this vulnerability, we instantiate it via the Criteria Attack, which subverts model judgments by injecting spurious decision criteria without altering the high-level task goal. Unlike Goal Hijacking, which attempts to override the system prompt, Reasoning Hijacking keeps the task goal intact but manipulates the model’s decision-making logic by injecting spurious reasoning shortcuts. Through extensive experiments on three different tasks (toxic comment, negative review, and spam detection), we demonstrate that even state-of-the-art models are highly fragile, consistently prioritizing injected heuristic shortcuts over rigorous semantic analysis. Crucially, because the model’s explicit intent remains aligned with the user’s instructions, these attacks can bypass defenses designed to detect goal deviation (e.g., SecAlign, StruQ), revealing a fundamental blind spot in the current safety landscape. Data and code are available at https://github.com/Yuan-Hou/criteria_attack.</abstract>

    <identifier type="citekey">liu-etal-2026-reasoning</identifier>
    <identifier type="doi">10.18653/v1/2026.acl-long.1698</identifier>
    <location>
      <url>https://aclanthology.org/2026.acl-long.1698/</url>
    </location>

    <!-- Page range of the paper within the host volume. -->
    <part>
      <date>2026-07</date>
      <extent unit="page">
        <start>36646</start>
        <end>36665</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Reasoning Hijacking: The Fragility of Reasoning Alignment in Large Language Models
%A Liu, Yuansen
%A Tang, Yixuan
%A Tung, Anthony Kum Hoe
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-390-6
%F liu-etal-2026-reasoning
%X Current LLM safety research predominantly focuses on mitigating Goal Hijacking, preventing attackers from redirecting a model’s high-level objective (e.g., from “summarizing emails” to “phishing users”). In this paper, we argue that this perspective is incomplete and highlight a critical vulnerability in Reasoning Alignment. We expose the inherent fragility of current alignment techniques by proposing a new adversarial prompt attack paradigm: Reasoning Hijacking. To demonstrate this vulnerability, we instantiate it via the Criteria Attack, which subverts model judgments by injecting spurious decision criteria without altering the high-level task goal. Unlike Goal Hijacking, which attempts to override the system prompt, Reasoning Hijacking keeps the task goal intact but manipulates the model’s decision-making logic by injecting spurious reasoning shortcuts. Through extensive experiments on three different tasks (toxic comment, negative review, and spam detection), we demonstrate that even state-of-the-art models are highly fragile, consistently prioritizing injected heuristic shortcuts over rigorous semantic analysis. Crucially, because the model’s explicit intent remains aligned with the user’s instructions, these attacks can bypass defenses designed to detect goal deviation (e.g., SecAlign, StruQ), revealing a fundamental blind spot in the current safety landscape. Data and code are available at https://github.com/Yuan-Hou/criteria_attack.
%R 10.18653/v1/2026.acl-long.1698
%U https://aclanthology.org/2026.acl-long.1698/
%U https://doi.org/10.18653/v1/2026.acl-long.1698
%P 36646-36665
Markdown (Informal)
[Reasoning Hijacking: The Fragility of Reasoning Alignment in Large Language Models](https://aclanthology.org/2026.acl-long.1698/) (Liu et al., ACL 2026)
ACL