BibTeX
@inproceedings{kassem-etal-2025-alpaca,
title = "{ALPACA} {AGAINST} {VICUNA}: Using {LLM}s to Uncover Memorization of {LLM}s",
author = "Kassem, Aly M. and
Mahmoud, Omar and
Mireshghallah, Niloofar and
Kim, Hyunwoo and
Tsvetkov, Yulia and
Choi, Yejin and
Saad, Sherif and
Rana, Santu",
editor = "Chiruzzo, Luis and
Ritter, Alan and
Wang, Lu",
booktitle = "Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)",
month = apr,
year = "2025",
address = "Albuquerque, New Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.naacl-long.421/",
doi = "10.18653/v1/2025.naacl-long.421",
pages = "8296--8321",
ISBN = "979-8-89176-189-6",
abstract = "In this paper, we investigate the overlooked impact of instruction-tuning on memorization in large language models (LLMs), which has largely been studied in base, pre-trained models. We propose a black-box prompt optimization method where an attacker LLM agent uncovers higher levels of memorization in a victim agent, surpassing traditional approaches that prompt the model directly with training data. Using an iterative rejection-sampling process, we design instruction-based prompts that minimize overlap with training data to avoid providing direct solutions while maximizing overlap between the victim{'}s output and the training data to induce memorization. Our method shows 23.7{\%} more overlap with training data compared to state-of-the-art baselines. We explore two attack settings: an analytical approach that determines the empirical upper bound of the attack, both with and without access to responses for prompt initialization, and a practical classifier-based method for assessing memorization without access to memorized data. Our findings reveal that instruction-tuned models can expose pre-training data as much as, or more than, base models; contexts beyond the original training data can lead to leakage; and instructions generated by other LLMs open new avenues for automated attacks, which we believe require further exploration."
}
MODS XML
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="kassem-etal-2025-alpaca">
<titleInfo>
<title>ALPACA AGAINST VICUNA: Using LLMs to Uncover Memorization of LLMs</title>
</titleInfo>
<name type="personal">
<namePart type="given">Aly</namePart>
<namePart type="given">M</namePart>
<namePart type="family">Kassem</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Omar</namePart>
<namePart type="family">Mahmoud</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Niloofar</namePart>
<namePart type="family">Mireshghallah</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hyunwoo</namePart>
<namePart type="family">Kim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yulia</namePart>
<namePart type="family">Tsvetkov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yejin</namePart>
<namePart type="family">Choi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sherif</namePart>
<namePart type="family">Saad</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Santu</namePart>
<namePart type="family">Rana</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-04</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Luis</namePart>
<namePart type="family">Chiruzzo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alan</namePart>
<namePart type="family">Ritter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lu</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Albuquerque, New Mexico</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-189-6</identifier>
</relatedItem>
<abstract>In this paper, we investigate the overlooked impact of instruction-tuning on memorization in large language models (LLMs), which has largely been studied in base, pre-trained models. We propose a black-box prompt optimization method where an attacker LLM agent uncovers higher levels of memorization in a victim agent, surpassing traditional approaches that prompt the model directly with training data. Using an iterative rejection-sampling process, we design instruction-based prompts that minimize overlap with training data to avoid providing direct solutions while maximizing overlap between the victim’s output and the training data to induce memorization. Our method shows 23.7% more overlap with training data compared to state-of-the-art baselines. We explore two attack settings: an analytical approach that determines the empirical upper bound of the attack, both with and without access to responses for prompt initialization, and a practical classifier-based method for assessing memorization without access to memorized data. Our findings reveal that instruction-tuned models can expose pre-training data as much as, or more than, base models; contexts beyond the original training data can lead to leakage; and instructions generated by other LLMs open new avenues for automated attacks, which we believe require further exploration.</abstract>
<identifier type="citekey">kassem-etal-2025-alpaca</identifier>
<identifier type="doi">10.18653/v1/2025.naacl-long.421</identifier>
<location>
<url>https://aclanthology.org/2025.naacl-long.421/</url>
</location>
<part>
<date>2025-04</date>
<extent unit="page">
<start>8296</start>
<end>8321</end>
</extent>
</part>
</mods>
</modsCollection>
Endnote
%0 Conference Proceedings
%T ALPACA AGAINST VICUNA: Using LLMs to Uncover Memorization of LLMs
%A Kassem, Aly M.
%A Mahmoud, Omar
%A Mireshghallah, Niloofar
%A Kim, Hyunwoo
%A Tsvetkov, Yulia
%A Choi, Yejin
%A Saad, Sherif
%A Rana, Santu
%Y Chiruzzo, Luis
%Y Ritter, Alan
%Y Wang, Lu
%S Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)
%D 2025
%8 April
%I Association for Computational Linguistics
%C Albuquerque, New Mexico
%@ 979-8-89176-189-6
%F kassem-etal-2025-alpaca
%X In this paper, we investigate the overlooked impact of instruction-tuning on memorization in large language models (LLMs), which has largely been studied in base, pre-trained models. We propose a black-box prompt optimization method where an attacker LLM agent uncovers higher levels of memorization in a victim agent, surpassing traditional approaches that prompt the model directly with training data. Using an iterative rejection-sampling process, we design instruction-based prompts that minimize overlap with training data to avoid providing direct solutions while maximizing overlap between the victim’s output and the training data to induce memorization. Our method shows 23.7% more overlap with training data compared to state-of-the-art baselines. We explore two attack settings: an analytical approach that determines the empirical upper bound of the attack, both with and without access to responses for prompt initialization, and a practical classifier-based method for assessing memorization without access to memorized data. Our findings reveal that instruction-tuned models can expose pre-training data as much as, or more than, base models; contexts beyond the original training data can lead to leakage; and instructions generated by other LLMs open new avenues for automated attacks, which we believe require further exploration.
%R 10.18653/v1/2025.naacl-long.421
%U https://aclanthology.org/2025.naacl-long.421/
%U https://doi.org/10.18653/v1/2025.naacl-long.421
%P 8296-8321
Markdown (Informal)
[ALPACA AGAINST VICUNA: Using LLMs to Uncover Memorization of LLMs](https://aclanthology.org/2025.naacl-long.421/) (Kassem et al., NAACL 2025)
ACL
- Aly M. Kassem, Omar Mahmoud, Niloofar Mireshghallah, Hyunwoo Kim, Yulia Tsvetkov, Yejin Choi, Sherif Saad, and Santu Rana. 2025. ALPACA AGAINST VICUNA: Using LLMs to Uncover Memorization of LLMs. In Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers), pages 8296–8321, Albuquerque, New Mexico. Association for Computational Linguistics.
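
The abstract above describes an iterative rejection-sampling loop in which an attacker LLM proposes instruction-style prompts, rejecting candidates that overlap too much with the training text itself while searching for prompts that push the victim model's output toward high overlap with that text. Below is a minimal, hypothetical sketch of such a loop; the helper names (`attacker_generate`, `victim_complete`, `overlap`), thresholds, and feedback strings are illustrative assumptions, not the authors' released implementation or the paper's exact overlap metric.

```python
# Hypothetical sketch of the black-box rejection-sampling attack described in the
# abstract: search for an instruction prompt that shares little surface overlap with
# a training sample but makes the victim model reproduce that sample.
# All function names, thresholds, and feedback strings are illustrative assumptions.

def overlap(a: str, b: str) -> float:
    """Crude token-overlap score in [0, 1] (stand-in for the paper's overlap metric)."""
    ta, tb = set(a.lower().split()), set(b.lower().split())
    return len(ta & tb) / max(len(tb), 1)

def attack(training_sample: str,
           attacker_generate,      # callable: feedback str -> candidate prompt str
           victim_complete,        # callable: prompt str -> victim output str
           n_iters: int = 20,
           max_prompt_overlap: float = 0.2):
    """Iteratively sample prompts, reject those that restate the training sample,
    and keep the prompt that induces the most memorized output from the victim."""
    best_prompt, best_score = None, -1.0
    feedback = "Write an instruction that elicits this text without quoting it."
    for _ in range(n_iters):
        prompt = attacker_generate(feedback)
        # Rejection step: discard prompts that simply hand the victim the answer.
        if overlap(prompt, training_sample) > max_prompt_overlap:
            feedback = "Rejected: prompt overlaps the target text too much; paraphrase more."
            continue
        output = victim_complete(prompt)
        score = overlap(output, training_sample)  # how much the victim regurgitates
        if score > best_score:
            best_prompt, best_score = prompt, score
        feedback = f"Last prompt scored {score:.2f}; try to increase victim overlap."
    return best_prompt, best_score
```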