@inproceedings{wu-chandrasekaran-2024-bypassing,
title = "Bypassing {LLM} Watermarks with Color-Aware Substitutions",
author = "Wu, Qilong and
Chandrasekaran, Varun",
editor = "Ku, Lun-Wei and
Martins, Andre and
Srikumar, Vivek",
booktitle = "Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = aug,
year = "2024",
address = "Bangkok, Thailand",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.luhme-long.464/",
doi = "10.18653/v1/2024.acl-long.464",
pages = "8549--8581",
abstract = "Watermarking approaches are proposed to identify if text being circulated is human- or large language model- (LLM) generated. The state-of-the-art watermarking strategy of Kirchenbauer et al. (2023a) biases the LLM to generate specific ({\textquotedblleft}green{\textquotedblright}) tokens. However, determining the robustness of this watermarking method under finite (low) edit budgets is an open problem. Additionally, existing attack methods failto evade detection for longer text segments. We overcome these limitations, and propose Self Color Testing-based Substitution (SCTS), thefirst {\textquotedblleft}color-aware{\textquotedblright} attack. SCTS obtains color information by strategically prompting the watermarked LLM and comparing output tokensfrequencies. It uses this information to determine token colors, and substitutes green tokens with non-green ones. In our experiments, SCTS successfully evades watermark detection using fewer number of edits than related work. Additionally, we show both theoretically and empirically that SCTS can remove the watermark for arbitrarily long watermarked text."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="wu-chandrasekaran-2024-bypassing">
<titleInfo>
<title>Bypassing LLM Watermarks with Color-Aware Substitutions</title>
</titleInfo>
<name type="personal">
<namePart type="given">Qilong</namePart>
<namePart type="family">Wu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Varun</namePart>
<namePart type="family">Chandrasekaran</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Lun-Wei</namePart>
<namePart type="family">Ku</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Andre</namePart>
<namePart type="family">Martins</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vivek</namePart>
<namePart type="family">Srikumar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Bangkok, Thailand</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Watermarking approaches are proposed to identify if text being circulated is human- or large language model- (LLM) generated. The state-of-the-art watermarking strategy of Kirchenbauer et al. (2023a) biases the LLM to generate specific (“green”) tokens. However, determining the robustness of this watermarking method under finite (low) edit budgets is an open problem. Additionally, existing attack methods failto evade detection for longer text segments. We overcome these limitations, and propose Self Color Testing-based Substitution (SCTS), thefirst “color-aware” attack. SCTS obtains color information by strategically prompting the watermarked LLM and comparing output tokensfrequencies. It uses this information to determine token colors, and substitutes green tokens with non-green ones. In our experiments, SCTS successfully evades watermark detection using fewer number of edits than related work. Additionally, we show both theoretically and empirically that SCTS can remove the watermark for arbitrarily long watermarked text.</abstract>
<identifier type="citekey">wu-chandrasekaran-2024-bypassing</identifier>
<identifier type="doi">10.18653/v1/2024.acl-long.464</identifier>
<location>
<url>https://aclanthology.org/2024.luhme-long.464/</url>
</location>
<part>
<date>2024-08</date>
<extent unit="page">
<start>8549</start>
<end>8581</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Bypassing LLM Watermarks with Color-Aware Substitutions
%A Wu, Qilong
%A Chandrasekaran, Varun
%Y Ku, Lun-Wei
%Y Martins, Andre
%Y Srikumar, Vivek
%S Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2024
%8 August
%I Association for Computational Linguistics
%C Bangkok, Thailand
%F wu-chandrasekaran-2024-bypassing
%X Watermarking approaches are proposed to identify if text being circulated is human- or large language model- (LLM) generated. The state-of-the-art watermarking strategy of Kirchenbauer et al. (2023a) biases the LLM to generate specific (“green”) tokens. However, determining the robustness of this watermarking method under finite (low) edit budgets is an open problem. Additionally, existing attack methods failto evade detection for longer text segments. We overcome these limitations, and propose Self Color Testing-based Substitution (SCTS), thefirst “color-aware” attack. SCTS obtains color information by strategically prompting the watermarked LLM and comparing output tokensfrequencies. It uses this information to determine token colors, and substitutes green tokens with non-green ones. In our experiments, SCTS successfully evades watermark detection using fewer number of edits than related work. Additionally, we show both theoretically and empirically that SCTS can remove the watermark for arbitrarily long watermarked text.
%R 10.18653/v1/2024.acl-long.464
%U https://aclanthology.org/2024.luhme-long.464/
%U https://doi.org/10.18653/v1/2024.acl-long.464
%P 8549-8581
Markdown (Informal)
[Bypassing LLM Watermarks with Color-Aware Substitutions](https://aclanthology.org/2024.luhme-long.464/) (Wu & Chandrasekaran, ACL 2024)
ACL
- Qilong Wu and Varun Chandrasekaran. 2024. Bypassing LLM Watermarks with Color-Aware Substitutions. In Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 8549–8581, Bangkok, Thailand. Association for Computational Linguistics.