@inproceedings{beetham-etal-2026-jailbreaks,
title = "Jailbreaks as Inference-Time Alignment: A Framework for Understanding Safety Failures in {LLM}s",
author = "Beetham, James and
Chakraborty, Souradip and
Wang, Mengdi and
Huang, Furong and
Bedi, Amrit Singh and
Shah, Mubarak",
editor = "Demberg, Vera and
Inui, Kentaro and
Marquez, Llu{\'i}s",
booktitle = "Proceedings of the 19th Conference of the {E}uropean Chapter of the {A}ssociation for {C}omputational {L}inguistics (Volume 1: Long Papers)",
month = mar,
year = "2026",
address = "Rabat, Morocco",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.eacl-long.360/",
pages = "7689--7713",
ISBN = "979-8-89176-380-7",
abstract = "Large language models (LLMs) are safety-aligned to prevent harmful response generation, yet still remain vulnerable to jailbreak attacks. While prior works have focused on improving jailbreak attack effectiveness, they offer little explanation for why safety alignment fails. We address this gap by framing jailbreaks as inference-time alignment, connecting attack design and safety alignment within a unified optimization framework. This framing allows us to extend best-of-N inference-time alignment to the adversarial setting, called LIAR (Leveraging Inference-time Alignment to jailbReak), and derive suboptimality bounds that show LIAR provably approaches an optimal jailbreak as compute scales. Interestingly, our framework allows us to develop the notion of a Safety-Net, a measure of how vulnerable an LLM is to jailbreaks, which helps to explain why safety alignment can fail. Empirically, LIAR produces natural, hard-to-detect prompts that achieve a competitive attack success rate while running 10 to 100x faster than prior suffix-based jailbreaks."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="beetham-etal-2026-jailbreaks">
<titleInfo>
<title>Jailbreaks as Inference-Time Alignment: A Framework for Understanding Safety Failures in LLMs</title>
</titleInfo>
<name type="personal">
<namePart type="given">James</namePart>
<namePart type="family">Beetham</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Souradip</namePart>
<namePart type="family">Chakraborty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mengdi</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Furong</namePart>
<namePart type="family">Huang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Amrit</namePart>
<namePart type="given">Singh</namePart>
<namePart type="family">Bedi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mubarak</namePart>
<namePart type="family">Shah</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 19th Conference of the European Chapter of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Vera</namePart>
<namePart type="family">Demberg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kentaro</namePart>
<namePart type="family">Inui</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lluís</namePart>
<namePart type="family">Marquez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Rabat, Morocco</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-380-7</identifier>
</relatedItem>
<abstract>Large language models (LLMs) are safety-aligned to prevent harmful response generation, yet still remain vulnerable to jailbreak attacks. While prior works have focused on improving jailbreak attack effectiveness, they offer little explanation for why safety alignment fails. We address this gap by framing jailbreaks as inference-time alignment, connecting attack design and safety alignment within a unified optimization framework. This framing allows us to extend best-of-N inference-time alignment to the adversarial setting, called LIAR (Leveraging Inference-time Alignment to jailbReak), and derive suboptimality bounds that show LIAR provably approaches an optimal jailbreak as compute scales. Interestingly, our framework allows us to develop the notion of a Safety-Net, a measure of how vulnerable an LLM is to jailbreaks, which helps to explain why safety alignment can fail. Empirically, LIAR produces natural, hard-to-detect prompts that achieve a competitive attack success rate while running 10 to 100x faster than prior suffix-based jailbreaks.</abstract>
<identifier type="citekey">beetham-etal-2026-jailbreaks</identifier>
<location>
<url>https://aclanthology.org/2026.eacl-long.360/</url>
</location>
<part>
<date>2026-03</date>
<extent unit="page">
<start>7689</start>
<end>7713</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Jailbreaks as Inference-Time Alignment: A Framework for Understanding Safety Failures in LLMs
%A Beetham, James
%A Chakraborty, Souradip
%A Wang, Mengdi
%A Huang, Furong
%A Bedi, Amrit Singh
%A Shah, Mubarak
%Y Demberg, Vera
%Y Inui, Kentaro
%Y Marquez, Lluís
%S Proceedings of the 19th Conference of the European Chapter of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2026
%8 March
%I Association for Computational Linguistics
%C Rabat, Morocco
%@ 979-8-89176-380-7
%F beetham-etal-2026-jailbreaks
%X Large language models (LLMs) are safety-aligned to prevent harmful response generation, yet still remain vulnerable to jailbreak attacks. While prior works have focused on improving jailbreak attack effectiveness, they offer little explanation for why safety alignment fails. We address this gap by framing jailbreaks as inference-time alignment, connecting attack design and safety alignment within a unified optimization framework. This framing allows us to extend best-of-N inference-time alignment to the adversarial setting, called LIAR (Leveraging Inference-time Alignment to jailbReak), and derive suboptimality bounds that show LIAR provably approaches an optimal jailbreak as compute scales. Interestingly, our framework allows us to develop the notion of a Safety-Net, a measure of how vulnerable an LLM is to jailbreaks, which helps to explain why safety alignment can fail. Empirically, LIAR produces natural, hard-to-detect prompts that achieve a competitive attack success rate while running 10 to 100x faster than prior suffix-based jailbreaks.
%U https://aclanthology.org/2026.eacl-long.360/
%P 7689-7713
Markdown (Informal)
[Jailbreaks as Inference-Time Alignment: A Framework for Understanding Safety Failures in LLMs](https://aclanthology.org/2026.eacl-long.360/) (Beetham et al., EACL 2026)
ACL