@inproceedings{guan-etal-2024-codeip,
title = "{C}ode{IP}: A Grammar-Guided Multi-Bit Watermark for Large Language Models of Code",
author = "Guan, Batu and
Wan, Yao and
Bi, Zhangqian and
Wang, Zheng and
Zhang, Hongyu and
Zhou, Pan and
Sun, Lichao",
editor = "Al-Onaizan, Yaser and
Bansal, Mohit and
Chen, Yun-Nung",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2024",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.findings-emnlp.541",
pages = "9243--9258",
abstract = "Large Language Models (LLMs) have achieved remarkable progress in code generation. It now becomes crucial to identify whether the code is AI-generated and to determine the specific model used, particularly for purposes such as protecting Intellectual Property (IP) in industry and preventing cheating in programming exercises. To this end, several attempts have been made to insert watermarks into machine-generated code. However, existing approaches are limited to inserting only a single bit of information. In this paper, we introduce CodeIP, a novel multi-bit watermarking technique that embeds additional information to preserve crucial provenance details, such as the vendor ID of an LLM, thereby safeguarding the IPs of LLMs in code generation. Furthermore, to ensure the syntactical correctness of the generated code, we propose constraining the sampling process for predicting the next token by training a type predictor. Experiments conducted on a real-world dataset across five programming languages demonstrate the effectiveness of CodeIP in watermarking LLMs for code generation while maintaining the syntactical correctness of code.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="guan-etal-2024-codeip">
<titleInfo>
<title>CodeIP: A Grammar-Guided Multi-Bit Watermark for Large Language Models of Code</title>
</titleInfo>
<name type="personal">
<namePart type="given">Batu</namePart>
<namePart type="family">Guan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yao</namePart>
<namePart type="family">Wan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhangqian</namePart>
<namePart type="family">Bi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zheng</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hongyu</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pan</namePart>
<namePart type="family">Zhou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lichao</namePart>
<namePart type="family">Sun</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2024</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yaser</namePart>
<namePart type="family">Al-Onaizan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohit</namePart>
<namePart type="family">Bansal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yun-Nung</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, Florida, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Large Language Models (LLMs) have achieved remarkable progress in code generation. It now becomes crucial to identify whether the code is AI-generated and to determine the specific model used, particularly for purposes such as protecting Intellectual Property (IP) in industry and preventing cheating in programming exercises. To this end, several attempts have been made to insert watermarks into machine-generated code. However, existing approaches are limited to inserting only a single bit of information. In this paper, we introduce CodeIP, a novel multi-bit watermarking technique that embeds additional information to preserve crucial provenance details, such as the vendor ID of an LLM, thereby safeguarding the IPs of LLMs in code generation. Furthermore, to ensure the syntactical correctness of the generated code, we propose constraining the sampling process for predicting the next token by training a type predictor. Experiments conducted on a real-world dataset across five programming languages demonstrate the effectiveness of CodeIP in watermarking LLMs for code generation while maintaining the syntactical correctness of code.</abstract>
<identifier type="citekey">guan-etal-2024-codeip</identifier>
<location>
<url>https://aclanthology.org/2024.findings-emnlp.541</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>9243</start>
<end>9258</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T CodeIP: A Grammar-Guided Multi-Bit Watermark for Large Language Models of Code
%A Guan, Batu
%A Wan, Yao
%A Bi, Zhangqian
%A Wang, Zheng
%A Zhang, Hongyu
%A Zhou, Pan
%A Sun, Lichao
%Y Al-Onaizan, Yaser
%Y Bansal, Mohit
%Y Chen, Yun-Nung
%S Findings of the Association for Computational Linguistics: EMNLP 2024
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F guan-etal-2024-codeip
%X Large Language Models (LLMs) have achieved remarkable progress in code generation. It now becomes crucial to identify whether the code is AI-generated and to determine the specific model used, particularly for purposes such as protecting Intellectual Property (IP) in industry and preventing cheating in programming exercises. To this end, several attempts have been made to insert watermarks into machine-generated code. However, existing approaches are limited to inserting only a single bit of information. In this paper, we introduce CodeIP, a novel multi-bit watermarking technique that embeds additional information to preserve crucial provenance details, such as the vendor ID of an LLM, thereby safeguarding the IPs of LLMs in code generation. Furthermore, to ensure the syntactical correctness of the generated code, we propose constraining the sampling process for predicting the next token by training a type predictor. Experiments conducted on a real-world dataset across five programming languages demonstrate the effectiveness of CodeIP in watermarking LLMs for code generation while maintaining the syntactical correctness of code.
%U https://aclanthology.org/2024.findings-emnlp.541
%P 9243-9258
Markdown (Informal)
[CodeIP: A Grammar-Guided Multi-Bit Watermark for Large Language Models of Code](https://aclanthology.org/2024.findings-emnlp.541) (Guan et al., Findings 2024)
ACL
- Batu Guan, Yao Wan, Zhangqian Bi, Zheng Wang, Hongyu Zhang, Pan Zhou, and Lichao Sun. 2024. CodeIP: A Grammar-Guided Multi-Bit Watermark for Large Language Models of Code. In Findings of the Association for Computational Linguistics: EMNLP 2024, pages 9243–9258, Miami, Florida, USA. Association for Computational Linguistics.