@inproceedings{han-etal-2026-measuring,
title = "Measuring Watermarking under Jailbreaking: {ASR} Inflation and Goal-Compliance Mismatch",
author = "Han, Sungwoo and
Moon, Sangjun and
Kwon, Jingun and
Kamigaito, Hidetaka and
Okumura, Manabu",
editor = "Liakata, Maria and
Moreira, Viviane P. and
Zhang, Jiajun and
Jurgens, David",
booktitle = "Findings of the {A}ssociation for {C}omputational {L}inguistics: {ACL} 2026",
month = jul,
year = "2026",
address = "San Diego, California, United States",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.findings-acl.1797/",
pages = "36071--36083",
ISBN = "979-8-89176-395-1",
abstract = "Recently, watermarking has attracted growing attention as a practical technique for source attribution of machine-generated text. However, most prior work studies watermarking under benign prompts, while its behavior under jailbreaking prompts remains underexplored. This gap matters because jailbreaking can bypass safety policies and shift the generation regime, raising concerns that watermarking may interact with model alignment under attack. To address this gap, we evaluate six watermarking methods on four LLMs across two jailbreak benchmarks and three settings: Static, AutoDAN, and DSN. We find that watermarking can inflate judge-based attack success rate, denoted ASR, under jailbreaking, with the largest effects appearing in biased schemes that perturb logits. At the same time, these ASR increases often do not reflect higher harmful-goal compliance when measured by StrongREJECT or by human judgments. This suggests that ASR-only evaluations can be brittle to decoding perturbations and may overestimate harmful-goal compliance, motivating complementary goal-compliance metrics (e.g., StrongREJECT) and human evaluations."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="han-etal-2026-measuring">
<titleInfo>
<title>Measuring Watermarking under Jailbreaking: ASR Inflation and Goal-Compliance Mismatch</title>
</titleInfo>
<name type="personal">
<namePart type="given">Sungwoo</namePart>
<namePart type="family">Han</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sangjun</namePart>
<namePart type="family">Moon</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jingun</namePart>
<namePart type="family">Kwon</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hidetaka</namePart>
<namePart type="family">Kamigaito</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Manabu</namePart>
<namePart type="family">Okumura</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2026</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Liakata</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Viviane</namePart>
<namePart type="given">P</namePart>
<namePart type="family">Moreira</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiajun</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Jurgens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-395-1</identifier>
</relatedItem>
<abstract>Recently, watermarking has attracted growing attention as a practical technique for source attribution of machine-generated text. However, most prior work studies watermarking under benign prompts, while its behavior under jailbreaking prompts remains underexplored. This gap matters because jailbreaking can bypass safety policies and shift the generation regime, raising concerns that watermarking may interact with model alignment under attack. To address this gap, we evaluate six watermarking methods on four LLMs across two jailbreak benchmarks and three settings: Static, AutoDAN, and DSN. We find that watermarking can inflate judge-based attack success rate, denoted ASR, under jailbreaking, with the largest effects appearing in biased schemes that perturb logits. At the same time, these ASR increases often do not reflect higher harmful-goal compliance when measured by StrongREJECT or by human judgments. This suggests that ASR-only evaluations can be brittle to decoding perturbations and may overestimate harmful-goal compliance, motivating complementary goal-compliance metrics (e.g., StrongREJECT) and human evaluations.</abstract>
<identifier type="citekey">han-etal-2026-measuring</identifier>
<location>
<url>https://aclanthology.org/2026.findings-acl.1797/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>36071</start>
<end>36083</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Measuring Watermarking under Jailbreaking: ASR Inflation and Goal-Compliance Mismatch
%A Han, Sungwoo
%A Moon, Sangjun
%A Kwon, Jingun
%A Kamigaito, Hidetaka
%A Okumura, Manabu
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Findings of the Association for Computational Linguistics: ACL 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-395-1
%F han-etal-2026-measuring
%X Recently, watermarking has attracted growing attention as a practical technique for source attribution of machine-generated text. However, most prior work studies watermarking under benign prompts, while its behavior under jailbreaking prompts remains underexplored. This gap matters because jailbreaking can bypass safety policies and shift the generation regime, raising concerns that watermarking may interact with model alignment under attack. To address this gap, we evaluate six watermarking methods on four LLMs across two jailbreak benchmarks and three settings: Static, AutoDAN, and DSN. We find that watermarking can inflate judge-based attack success rate, denoted ASR, under jailbreaking, with the largest effects appearing in biased schemes that perturb logits. At the same time, these ASR increases often do not reflect higher harmful-goal compliance when measured by StrongREJECT or by human judgments. This suggests that ASR-only evaluations can be brittle to decoding perturbations and may overestimate harmful-goal compliance, motivating complementary goal-compliance metrics (e.g., StrongREJECT) and human evaluations.
%U https://aclanthology.org/2026.findings-acl.1797/
%P 36071-36083
Markdown (Informal)
[Measuring Watermarking under Jailbreaking: ASR Inflation and Goal-Compliance Mismatch](https://aclanthology.org/2026.findings-acl.1797/) (Han et al., Findings 2026)
ACL