@inproceedings{lu-etal-2025-wasa,
title = "{WASA}: {WA}termark-based Source Attribution for Large Language Model-Generated Data",
author = "Lu, Xinyang and
Wang, Jingtan and
Zhao, Zitong and
Dai, Zhongxiang and
Foo, Chuan-Sheng and
Ng, See-Kiong and
Low, Bryan Kian Hsiang",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2025",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-acl.1219/",
doi = "10.18653/v1/2025.findings-acl.1219",
pages = "23791--23824",
ISBN = "979-8-89176-256-5",
abstract = "The impressive performances of Large Language Models (LLMs) and their immense potential for commercialization have given rise to serious concerns over the Intellectual Property (IP) of their training data. In particular, the synthetic texts generated by LLMs may infringe the IP of the data being used to train the LLMs. To this end, it is imperative to be able to perform source attribution by identifying the data provider who contributed to the generation of a synthetic text by an LLM. In this paper, we show that this problem can be tackled by watermarking, i.e., by enabling an LLM to generate synthetic texts with embedded watermarks that contain information about their source(s). We identify the key properties of such watermarking frameworks (e.g., source attribution accuracy, robustness against adversaries), and propose a source attribution framework that satisfies these key properties due to our algorithmic designs. Our framework enables an LLM to learn an accurate mapping from the generated texts to data providers, which sets the foundation for effective source attribution. Extensive empirical evaluations show that our framework achieves effective source attribution."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="lu-etal-2025-wasa">
<titleInfo>
<title>WASA: WAtermark-based Source Attribution for Large Language Model-Generated Data</title>
</titleInfo>
<name type="personal">
<namePart type="given">Xinyang</namePart>
<namePart type="family">Lu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jingtan</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zitong</namePart>
<namePart type="family">Zhao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhongxiang</namePart>
<namePart type="family">Dai</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chuan-Sheng</namePart>
<namePart type="family">Foo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">See-Kiong</namePart>
<namePart type="family">Ng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bryan</namePart>
<namePart type="given">Kian</namePart>
<namePart type="given">Hsiang</namePart>
<namePart type="family">Low</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2025</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wanxiang</namePart>
<namePart type="family">Che</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joyce</namePart>
<namePart type="family">Nabende</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Shutova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohammad</namePart>
<namePart type="given">Taher</namePart>
<namePart type="family">Pilehvar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-256-5</identifier>
</relatedItem>
<abstract>The impressive performances of Large Language Models (LLMs) and their immense potential for commercialization have given rise to serious concerns over the Intellectual Property (IP) of their training data. In particular, the synthetic texts generated by LLMs may infringe the IP of the data being used to train the LLMs. To this end, it is imperative to be able to perform source attribution by identifying the data provider who contributed to the generation of a synthetic text by an LLM. In this paper, we show that this problem can be tackled by watermarking, i.e., by enabling an LLM to generate synthetic texts with embedded watermarks that contain information about their source(s). We identify the key properties of such watermarking frameworks (e.g., source attribution accuracy, robustness against adversaries), and propose a source attribution framework that satisfies these key properties due to our algorithmic designs. Our framework enables an LLM to learn an accurate mapping from the generated texts to data providers, which sets the foundation for effective source attribution. Extensive empirical evaluations show that our framework achieves effective source attribution.</abstract>
<identifier type="citekey">lu-etal-2025-wasa</identifier>
<identifier type="doi">10.18653/v1/2025.findings-acl.1219</identifier>
<location>
<url>https://aclanthology.org/2025.findings-acl.1219/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>23791</start>
<end>23824</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T WASA: WAtermark-based Source Attribution for Large Language Model-Generated Data
%A Lu, Xinyang
%A Wang, Jingtan
%A Zhao, Zitong
%A Dai, Zhongxiang
%A Foo, Chuan-Sheng
%A Ng, See-Kiong
%A Low, Bryan Kian Hsiang
%Y Che, Wanxiang
%Y Nabende, Joyce
%Y Shutova, Ekaterina
%Y Pilehvar, Mohammad Taher
%S Findings of the Association for Computational Linguistics: ACL 2025
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-256-5
%F lu-etal-2025-wasa
%X The impressive performances of Large Language Models (LLMs) and their immense potential for commercialization have given rise to serious concerns over the Intellectual Property (IP) of their training data. In particular, the synthetic texts generated by LLMs may infringe the IP of the data being used to train the LLMs. To this end, it is imperative to be able to perform source attribution by identifying the data provider who contributed to the generation of a synthetic text by an LLM. In this paper, we show that this problem can be tackled by watermarking, i.e., by enabling an LLM to generate synthetic texts with embedded watermarks that contain information about their source(s). We identify the key properties of such watermarking frameworks (e.g., source attribution accuracy, robustness against adversaries), and propose a source attribution framework that satisfies these key properties due to our algorithmic designs. Our framework enables an LLM to learn an accurate mapping from the generated texts to data providers, which sets the foundation for effective source attribution. Extensive empirical evaluations show that our framework achieves effective source attribution.
%R 10.18653/v1/2025.findings-acl.1219
%U https://aclanthology.org/2025.findings-acl.1219/
%U https://doi.org/10.18653/v1/2025.findings-acl.1219
%P 23791-23824
Markdown (Informal)
[WASA: WAtermark-based Source Attribution for Large Language Model-Generated Data](https://aclanthology.org/2025.findings-acl.1219/) (Lu et al., Findings 2025)
ACL