@inproceedings{qin-etal-2026-entrobench,
title = "{E}ntro{B}ench: Evaluating {LLM} Watermarking Under Multi-Entropy Scenarios and Practical User Operations",
author = "Qin, Pengyuan and
Tu, Linnan and
Ke, Yuhan and
Ling, Hefei",
editor = "Liakata, Maria and
Moreira, Viviane P. and
Zhang, Jiajun and
Jurgens, David",
booktitle = "Findings of the {A}ssociation for {C}omputational {L}inguistics: {ACL} 2026",
month = jul,
year = "2026",
address = "San Diego, California, United States",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.findings-acl.2089/",
pages = "42101--42118",
ISBN = "979-8-89176-395-1",
abstract = "Large language models (LLMs) watermarking has been proposed as an active approach for content provenance verification, yet existing evaluations are largely confined to fixed entropy settings. In this paper, we introduce EntroBench, a benchmark for LLM watermarking that systematically covers three entropy levels and seven representative tasks. We conducted a fair evaluation of eight watermarking methods through hyper-parameter search based on an anchored dataset. We find that current approaches struggle to perform consistently across different entropy levels. Our analysis reveals a clear trade-off between watermark detectability and downstream output quality that varies across tasks and entropy conditions. Furthermore, we assess watermark robustness under realistic user interaction scenarios and show that common, non-adversarial user behaviors can substantially degrade watermark signals. These results indicate that practical usage-driven perturbations pose a significant challenge to current watermarking techniques. EntroBench provides a unified evaluation framework for studying these issues and supports the development of more adaptive and robust LLM watermarking methods. Dataset and codes are available at https://github.com/py-qin/EntroBench."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="qin-etal-2026-entrobench">
<titleInfo>
<title>EntroBench: Evaluating LLM Watermarking Under Multi-Entropy Scenarios and Practical User Operations</title>
</titleInfo>
<name type="personal">
<namePart type="given">Pengyuan</namePart>
<namePart type="family">Qin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Linnan</namePart>
<namePart type="family">Tu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yuhan</namePart>
<namePart type="family">Ke</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hefei</namePart>
<namePart type="family">Ling</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2026</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Liakata</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Viviane</namePart>
<namePart type="given">P.</namePart>
<namePart type="family">Moreira</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiajun</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Jurgens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-395-1</identifier>
</relatedItem>
<abstract>Large language models (LLMs) watermarking has been proposed as an active approach for content provenance verification, yet existing evaluations are largely confined to fixed entropy settings. In this paper, we introduce EntroBench, a benchmark for LLM watermarking that systematically covers three entropy levels and seven representative tasks. We conducted a fair evaluation of eight watermarking methods through hyper-parameter search based on an anchored dataset. We find that current approaches struggle to perform consistently across different entropy levels. Our analysis reveals a clear trade-off between watermark detectability and downstream output quality that varies across tasks and entropy conditions. Furthermore, we assess watermark robustness under realistic user interaction scenarios and show that common, non-adversarial user behaviors can substantially degrade watermark signals. These results indicate that practical usage-driven perturbations pose a significant challenge to current watermarking techniques. EntroBench provides a unified evaluation framework for studying these issues and supports the development of more adaptive and robust LLM watermarking methods. Dataset and codes are available at https://github.com/py-qin/EntroBench.</abstract>
<identifier type="citekey">qin-etal-2026-entrobench</identifier>
<location>
<url>https://aclanthology.org/2026.findings-acl.2089/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>42101</start>
<end>42118</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T EntroBench: Evaluating LLM Watermarking Under Multi-Entropy Scenarios and Practical User Operations
%A Qin, Pengyuan
%A Tu, Linnan
%A Ke, Yuhan
%A Ling, Hefei
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Findings of the Association for Computational Linguistics: ACL 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-395-1
%F qin-etal-2026-entrobench
%X Large language models (LLMs) watermarking has been proposed as an active approach for content provenance verification, yet existing evaluations are largely confined to fixed entropy settings. In this paper, we introduce EntroBench, a benchmark for LLM watermarking that systematically covers three entropy levels and seven representative tasks. We conducted a fair evaluation of eight watermarking methods through hyper-parameter search based on an anchored dataset. We find that current approaches struggle to perform consistently across different entropy levels. Our analysis reveals a clear trade-off between watermark detectability and downstream output quality that varies across tasks and entropy conditions. Furthermore, we assess watermark robustness under realistic user interaction scenarios and show that common, non-adversarial user behaviors can substantially degrade watermark signals. These results indicate that practical usage-driven perturbations pose a significant challenge to current watermarking techniques. EntroBench provides a unified evaluation framework for studying these issues and supports the development of more adaptive and robust LLM watermarking methods. Dataset and codes are available at https://github.com/py-qin/EntroBench.
%U https://aclanthology.org/2026.findings-acl.2089/
%P 42101-42118
Markdown (Informal)
[EntroBench: Evaluating LLM Watermarking Under Multi-Entropy Scenarios and Practical User Operations](https://aclanthology.org/2026.findings-acl.2089/) (Qin et al., Findings 2026)
ACL