@inproceedings{kandpal-etal-2024-user,
title = "User Inference Attacks on Large Language Models",
author = "Kandpal, Nikhil and
Pillutla, Krishna and
Oprea, Alina and
Kairouz, Peter and
Choquette-Choo, Christopher and
Xu, Zheng",
editor = "Al-Onaizan, Yaser and
Bansal, Mohit and
Chen, Yun-Nung",
booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.emnlp-main.1014",
pages = "18238--18265",
abstract = "Text written by humans makes up the vast majority of the data used to pre-train and fine-tune large language models (LLMs). Many sources of this data{---}like code, forum posts, personal websites, and books{---}are easily attributed to one or a few {``}users{''}. In this paper, we ask if it is possible to infer if any of a {\_}user{'}s{\_} data was used to train an LLM. Not only would this constitute a breach of privacy, but it would also enable users to detect when their data was used for training. We develop the first effective attacks for {\_}user inference{\_}{---}at times, with near-perfect success{---}against LLMs. Our attacks are easy to employ, requiring only black-box access to an LLM and a few samples from the user, which {\_}need not be the ones that were trained on{\_}. We find, both theoretically and empirically, that certain properties make users more susceptible to user inference: being an outlier, having highly correlated examples, and contributing a larger fraction of data. Based on these findings, we identify several methods for mitigating user inference including training with example-level differential privacy, removing within-user duplicate examples, and reducing a user{'}s contribution to the training data. Though these provide partial mitigation, our work highlights the need to develop methods to fully protect LLMs from user inference.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="kandpal-etal-2024-user">
    <titleInfo>
      <title>User Inference Attacks on Large Language Models</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Nikhil</namePart>
      <namePart type="family">Kandpal</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Krishna</namePart>
      <namePart type="family">Pillutla</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Alina</namePart>
      <namePart type="family">Oprea</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Peter</namePart>
      <namePart type="family">Kairouz</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Christopher</namePart>
      <namePart type="family">Choquette-Choo</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Zheng</namePart>
      <namePart type="family">Xu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2024-11</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Yaser</namePart>
        <namePart type="family">Al-Onaizan</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Mohit</namePart>
        <namePart type="family">Bansal</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Yun-Nung</namePart>
        <namePart type="family">Chen</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Miami, Florida, USA</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Text written by humans makes up the vast majority of the data used to pre-train and fine-tune large language models (LLMs). Many sources of this data—like code, forum posts, personal websites, and books—are easily attributed to one or a few “users”. In this paper, we ask if it is possible to infer if any of a _user’s_ data was used to train an LLM. Not only would this constitute a breach of privacy, but it would also enable users to detect when their data was used for training. We develop the first effective attacks for _user inference_—at times, with near-perfect success—against LLMs. Our attacks are easy to employ, requiring only black-box access to an LLM and a few samples from the user, which _need not be the ones that were trained on_. We find, both theoretically and empirically, that certain properties make users more susceptible to user inference: being an outlier, having highly correlated examples, and contributing a larger fraction of data. Based on these findings, we identify several methods for mitigating user inference including training with example-level differential privacy, removing within-user duplicate examples, and reducing a user’s contribution to the training data. Though these provide partial mitigation, our work highlights the need to develop methods to fully protect LLMs from user inference.</abstract>
    <identifier type="citekey">kandpal-etal-2024-user</identifier>
    <location>
      <url>https://aclanthology.org/2024.emnlp-main.1014</url>
    </location>
    <part>
      <date>2024-11</date>
      <extent unit="page">
        <start>18238</start>
        <end>18265</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T User Inference Attacks on Large Language Models
%A Kandpal, Nikhil
%A Pillutla, Krishna
%A Oprea, Alina
%A Kairouz, Peter
%A Choquette-Choo, Christopher
%A Xu, Zheng
%Y Al-Onaizan, Yaser
%Y Bansal, Mohit
%Y Chen, Yun-Nung
%S Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F kandpal-etal-2024-user
%X Text written by humans makes up the vast majority of the data used to pre-train and fine-tune large language models (LLMs). Many sources of this data—like code, forum posts, personal websites, and books—are easily attributed to one or a few “users”. In this paper, we ask if it is possible to infer if any of a _user’s_ data was used to train an LLM. Not only would this constitute a breach of privacy, but it would also enable users to detect when their data was used for training. We develop the first effective attacks for _user inference_—at times, with near-perfect success—against LLMs. Our attacks are easy to employ, requiring only black-box access to an LLM and a few samples from the user, which _need not be the ones that were trained on_. We find, both theoretically and empirically, that certain properties make users more susceptible to user inference: being an outlier, having highly correlated examples, and contributing a larger fraction of data. Based on these findings, we identify several methods for mitigating user inference including training with example-level differential privacy, removing within-user duplicate examples, and reducing a user’s contribution to the training data. Though these provide partial mitigation, our work highlights the need to develop methods to fully protect LLMs from user inference.
%U https://aclanthology.org/2024.emnlp-main.1014
%P 18238-18265
Markdown (Informal):
[User Inference Attacks on Large Language Models](https://aclanthology.org/2024.emnlp-main.1014) (Kandpal et al., EMNLP 2024)

ACL:
Nikhil Kandpal, Krishna Pillutla, Alina Oprea, Peter Kairouz, Christopher Choquette-Choo, and Zheng Xu. 2024. User Inference Attacks on Large Language Models. In Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing, pages 18238–18265, Miami, Florida, USA. Association for Computational Linguistics.
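
As a companion to the abstract above, which describes an attack that needs only black-box access to the model and a few samples from the user (not necessarily the samples that were trained on), here is a minimal illustrative sketch of a user inference statistic. It is not the authors' code; it assumes a membership-style test that aggregates per-sample log-likelihood ratios between the target LLM and a reference model, and the scoring callables and threshold calibration are hypothetical placeholders.

```python
# Minimal sketch of a user inference attack statistic (illustrative only; not the
# paper's implementation). Assumption: the attacker can obtain a total log-likelihood
# for a text from the target LLM (black-box scoring) and from a reference model that
# was not trained on the user's data; both scoring functions are placeholders here.
from statistics import mean
from typing import Callable, Sequence


def user_inference_statistic(
    user_samples: Sequence[str],
    target_loglik: Callable[[str], float],
    reference_loglik: Callable[[str], float],
) -> float:
    """Mean log-likelihood ratio over the user's samples.

    The samples need not be the ones used in training; a larger value means the
    target model finds this user's writing unusually likely relative to the
    reference model, which is evidence that the user's data was part of the
    training set.
    """
    return mean(target_loglik(x) - reference_loglik(x) for x in user_samples)


def is_likely_training_user(statistic: float, threshold: float) -> bool:
    """Decide membership by comparing the statistic to a threshold calibrated on
    users known to be outside the training set (e.g., to fix a false positive rate)."""
    return statistic > threshold
```

In practice, the two scoring callables would wrap whatever black-box likelihood access the attacker has to the fine-tuned model and to a comparable reference model, and the threshold would be chosen from held-out non-member users; the abstract's mitigations (example-level differential privacy, within-user deduplication, capping a user's contribution) all act by weakening this per-user likelihood gap.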