@inproceedings{satvaty-etal-2025-memorization,
title = "Memorization is Language-Sensitive: Analyzing Memorization and Inference Risks of {LLM}s in a Multilingual Setting",
author = "Satvaty, Ali and
Visman, Anna and
Seidel, Dan and
Verberne, Suzan and
Turkmen, Fatih",
editor = "Jia, Robin and
Wallace, Eric and
Huang, Yangsibo and
Pimentel, Tiago and
Maini, Pratyush and
Dankers, Verna and
Wei, Johnny and
Lesci, Pietro",
booktitle = "Proceedings of the First Workshop on Large Language Model Memorization (L2M2)",
month = aug,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.l2m2-1.9/",
doi = "10.18653/v1/2025.l2m2-1.9",
pages = "106--126",
ISBN = "979-8-89176-278-7",
abstract = "Large Language Models (LLMs) are known to memorize and reproduce parts of their training data during inference, raising significant privacy and safety concerns. While this phenomenon has been extensively studied to explain its contributing factors and countermeasures, its implications in multilingual contexts remain largely unexplored.In this work, we investigate cross-lingual differences in memorization behaviors of multilingual LLMs.Specifically, we examine both discoverable memorization and susceptibility to perplexity ratio attacks using Pythia models of varying sizes, evaluated on two parallel multilingual datasets.Our results reveal that lower-resource languages consistently exhibit higher vulnerability to perplexity ratio attacks, indicating greater privacy risks. In contrast, patterns of discoverable memorization appear to be influenced more strongly by the model{'}s pretraining or fine-tuning phases than by language resource level alone.These findings highlight the nuanced interplay between language resource availability and memorization in multilingual LLMs, providing insights toward developing safer and more privacy-preserving language models across diverse linguistic settings."
}

<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="satvaty-etal-2025-memorization">
    <titleInfo>
      <title>Memorization is Language-Sensitive: Analyzing Memorization and Inference Risks of LLMs in a Multilingual Setting</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Ali</namePart>
      <namePart type="family">Satvaty</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Anna</namePart>
      <namePart type="family">Visman</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Dan</namePart>
      <namePart type="family">Seidel</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Suzan</namePart>
      <namePart type="family">Verberne</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Fatih</namePart>
      <namePart type="family">Turkmen</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2025-08</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the First Workshop on Large Language Model Memorization (L2M2)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Robin</namePart>
        <namePart type="family">Jia</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Eric</namePart>
        <namePart type="family">Wallace</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Yangsibo</namePart>
        <namePart type="family">Huang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Tiago</namePart>
        <namePart type="family">Pimentel</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Pratyush</namePart>
        <namePart type="family">Maini</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Verna</namePart>
        <namePart type="family">Dankers</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Johnny</namePart>
        <namePart type="family">Wei</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Pietro</namePart>
        <namePart type="family">Lesci</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Vienna, Austria</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-278-7</identifier>
    </relatedItem>
    <abstract>Large Language Models (LLMs) are known to memorize and reproduce parts of their training data during inference, raising significant privacy and safety concerns. While this phenomenon has been extensively studied to explain its contributing factors and countermeasures, its implications in multilingual contexts remain largely unexplored. In this work, we investigate cross-lingual differences in memorization behaviors of multilingual LLMs. Specifically, we examine both discoverable memorization and susceptibility to perplexity ratio attacks using Pythia models of varying sizes, evaluated on two parallel multilingual datasets. Our results reveal that lower-resource languages consistently exhibit higher vulnerability to perplexity ratio attacks, indicating greater privacy risks. In contrast, patterns of discoverable memorization appear to be influenced more strongly by the model’s pretraining or fine-tuning phases than by language resource level alone. These findings highlight the nuanced interplay between language resource availability and memorization in multilingual LLMs, providing insights toward developing safer and more privacy-preserving language models across diverse linguistic settings.</abstract>
<identifier type="citekey">satvaty-etal-2025-memorization</identifier>
<identifier type="doi">10.18653/v1/2025.l2m2-1.9</identifier>
<location>
<url>https://aclanthology.org/2025.l2m2-1.9/</url>
</location>
<part>
<date>2025-08</date>
<extent unit="page">
<start>106</start>
<end>126</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Memorization is Language-Sensitive: Analyzing Memorization and Inference Risks of LLMs in a Multilingual Setting
%A Satvaty, Ali
%A Visman, Anna
%A Seidel, Dan
%A Verberne, Suzan
%A Turkmen, Fatih
%Y Jia, Robin
%Y Wallace, Eric
%Y Huang, Yangsibo
%Y Pimentel, Tiago
%Y Maini, Pratyush
%Y Dankers, Verna
%Y Wei, Johnny
%Y Lesci, Pietro
%S Proceedings of the First Workshop on Large Language Model Memorization (L2M2)
%D 2025
%8 August
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-278-7
%F satvaty-etal-2025-memorization
%X Large Language Models (LLMs) are known to memorize and reproduce parts of their training data during inference, raising significant privacy and safety concerns. While this phenomenon has been extensively studied to explain its contributing factors and countermeasures, its implications in multilingual contexts remain largely unexplored. In this work, we investigate cross-lingual differences in memorization behaviors of multilingual LLMs. Specifically, we examine both discoverable memorization and susceptibility to perplexity ratio attacks using Pythia models of varying sizes, evaluated on two parallel multilingual datasets. Our results reveal that lower-resource languages consistently exhibit higher vulnerability to perplexity ratio attacks, indicating greater privacy risks. In contrast, patterns of discoverable memorization appear to be influenced more strongly by the model’s pretraining or fine-tuning phases than by language resource level alone. These findings highlight the nuanced interplay between language resource availability and memorization in multilingual LLMs, providing insights toward developing safer and more privacy-preserving language models across diverse linguistic settings.
%R 10.18653/v1/2025.l2m2-1.9
%U https://aclanthology.org/2025.l2m2-1.9/
%U https://doi.org/10.18653/v1/2025.l2m2-1.9
%P 106-126

Markdown (Informal)
[Memorization is Language-Sensitive: Analyzing Memorization and Inference Risks of LLMs in a Multilingual Setting](https://aclanthology.org/2025.l2m2-1.9/) (Satvaty et al., L2M2 2025)
ACL
Ali Satvaty, Anna Visman, Dan Seidel, Suzan Verberne, and Fatih Turkmen. 2025. Memorization is Language-Sensitive: Analyzing Memorization and Inference Risks of LLMs in a Multilingual Setting. In Proceedings of the First Workshop on Large Language Model Memorization (L2M2), pages 106–126, Vienna, Austria. Association for Computational Linguistics.
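
The perplexity ratio attack named in the abstract compares how predictable a candidate string is under the target model versus an independent reference model; text the target memorized during training tends to score disproportionately low perplexity. The sketch below illustrates only the general idea: the checkpoint names (EleutherAI/pythia-410m and gpt2 as the reference) and the interpretation of the ratio are illustrative assumptions, not the paper's exact setup.

```python
# Hypothetical sketch of a perplexity-ratio membership test.
# Model choices and the decision rule are assumptions for illustration,
# not the configuration used in the paper.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

def perplexity(model, tokenizer, text):
    """Token-level perplexity of `text` under `model`."""
    enc = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        out = model(**enc, labels=enc["input_ids"])
    # For causal LMs, `loss` is the mean token cross-entropy (labels
    # are shifted internally), so exp(loss) is the perplexity.
    return torch.exp(out.loss).item()

# Target: a small Pythia checkpoint (the model family named in the abstract).
tgt_tok = AutoTokenizer.from_pretrained("EleutherAI/pythia-410m")
tgt = AutoModelForCausalLM.from_pretrained("EleutherAI/pythia-410m").eval()

# Reference: any independent LM; GPT-2 is an arbitrary stand-in here.
ref_tok = AutoTokenizer.from_pretrained("gpt2")
ref = AutoModelForCausalLM.from_pretrained("gpt2").eval()

candidate = "A sentence whose presence in the training data we want to test."
ratio = perplexity(tgt, tgt_tok, candidate) / perplexity(ref, ref_tok, candidate)

# A ratio well below 1 means the target finds the text unusually predictable
# relative to the reference, which is weak evidence of memorization.
print(f"perplexity ratio: {ratio:.3f}")
```

The paper's finding that lower-resource languages are more vulnerable would correspond, in this framing, to the ratio separating training members from non-members more sharply for those languages.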