@inproceedings{harel-canada-etal-2025-sandcastles,
title = "Sandcastles in the Storm: Revisiting the (Im)possibility of Strong Watermarking",
author = "Harel-Canada, Fabrice Y and
Erol, Boran and
Choi, Connor and
Liu, Jason and
Song, Gary Jiarui and
Peng, Nanyun and
Sahai, Amit",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.acl-long.1436/",
doi = "10.18653/v1/2025.acl-long.1436",
pages = "29698--29735",
ISBN = "979-8-89176-251-0",
abstract = "Watermarking AI-generated text is critical for combating misuse. Yet recent theoretical work argues that any watermark can be erased via random walk attacks that perturb text while preserving quality. However, such attacks rely on two key assumptions: (1) rapid mixing (watermarks dissolve quickly under perturbations) and (2) reliable quality preservation (automated quality oracles perfectly guide edits). Through large-scale experiments and human-validated assessments, we find mixing is slow: 100{\%} of perturbed texts retain traces of their origin after hundreds of edits, defying rapid mixing. Oracles falter, as state-of-the-art quality detectors misjudge edits (77{\%} accuracy), compounding errors during attacks. Ultimately, attacks underperform: automated walks remove watermarks just 26{\%} of the time {--} dropping to 10{\%} under human quality review. These findings challenge the inevitability of watermark removal. Instead, practical barriers {--} slow mixing and imperfect quality control {--} reveal watermarking to be far more robust than theoretical models suggest. The gap between idealized attacks and real-world feasibility underscores the need for stronger watermarking methods and more realistic attack models."
}