@inproceedings{chang-etal-2025-watermark,
title = "Watermark Smoothing Attacks against Language Models",
author = "Chang, Hongyan and
Hassani, Hamed and
Shokri, Reza",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2025",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-emnlp.264/",
doi = "10.18653/v1/2025.findings-emnlp.264",
pages = "4915--4941",
ISBN = "979-8-89176-335-7",
abstract = "Watermarking is a key technique for detecting AI-generated text. In this work, we study its vulnerabilities and introduce the Smoothing Attack, a novel watermark removal method. By leveraging the relationship between the model{'}s confidence and watermark detectability, our attack selectively smoothes the watermarked content, erasing watermark traces while preserving text quality. We validate our attack on open-source models ranging from 1.3 B to 30B parameters on 10 different watermarks, demonstrating its effectiveness. Our findings expose critical weaknesses in existing watermarking schemes and highlight the need for stronger defenses."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="chang-etal-2025-watermark">
<titleInfo>
<title>Watermark Smoothing Attacks against Language Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Hongyan</namePart>
<namePart type="family">Chang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hamed</namePart>
<namePart type="family">Hassani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Reza</namePart>
<namePart type="family">Shokri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2025</title>
</titleInfo>
<name type="personal">
<namePart type="given">Christos</namePart>
<namePart type="family">Christodoulopoulos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tanmoy</namePart>
<namePart type="family">Chakraborty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Carolyn</namePart>
<namePart type="family">Rose</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Violet</namePart>
<namePart type="family">Peng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Suzhou, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-335-7</identifier>
</relatedItem>
<abstract>Watermarking is a key technique for detecting AI-generated text. In this work, we study its vulnerabilities and introduce the Smoothing Attack, a novel watermark removal method. By leveraging the relationship between the model’s confidence and watermark detectability, our attack selectively smoothes the watermarked content, erasing watermark traces while preserving text quality. We validate our attack on open-source models ranging from 1.3 B to 30B parameters on 10 different watermarks, demonstrating its effectiveness. Our findings expose critical weaknesses in existing watermarking schemes and highlight the need for stronger defenses.</abstract>
<identifier type="citekey">chang-etal-2025-watermark</identifier>
<identifier type="doi">10.18653/v1/2025.findings-emnlp.264</identifier>
<location>
<url>https://aclanthology.org/2025.findings-emnlp.264/</url>
</location>
<part>
<date>2025-11</date>
<extent unit="page">
<start>4915</start>
<end>4941</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Watermark Smoothing Attacks against Language Models
%A Chang, Hongyan
%A Hassani, Hamed
%A Shokri, Reza
%Y Christodoulopoulos, Christos
%Y Chakraborty, Tanmoy
%Y Rose, Carolyn
%Y Peng, Violet
%S Findings of the Association for Computational Linguistics: EMNLP 2025
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-335-7
%F chang-etal-2025-watermark
%X Watermarking is a key technique for detecting AI-generated text. In this work, we study its vulnerabilities and introduce the Smoothing Attack, a novel watermark removal method. By leveraging the relationship between the model’s confidence and watermark detectability, our attack selectively smoothes the watermarked content, erasing watermark traces while preserving text quality. We validate our attack on open-source models ranging from 1.3 B to 30B parameters on 10 different watermarks, demonstrating its effectiveness. Our findings expose critical weaknesses in existing watermarking schemes and highlight the need for stronger defenses.
%R 10.18653/v1/2025.findings-emnlp.264
%U https://aclanthology.org/2025.findings-emnlp.264/
%U https://doi.org/10.18653/v1/2025.findings-emnlp.264
%P 4915-4941
Markdown (Informal)
[Watermark Smoothing Attacks against Language Models](https://aclanthology.org/2025.findings-emnlp.264/) (Chang et al., Findings 2025)
ACL