@inproceedings{hu-etal-2025-simple,
title = "Simple Yet Effective: Extracting Private Data Across Clients in Federated Fine-Tuning of Large Language Models",
author = "Hu, Yingqi and
Zhang, Zhuo and
Zhang, Jingyuan and
Wang, Jinghua and
Wang, Qifan and
Qu, Lizhen and
Xu, Zenglin",
editor = "Inui, Kentaro and
Sakti, Sakriani and
Wang, Haofen and
Wong, Derek F. and
Bhattacharyya, Pushpak and
Banerjee, Biplab and
Ekbal, Asif and
Chakraborty, Tanmoy and
Singh, Dhirendra Pratap",
booktitle = "Proceedings of the 14th International Joint Conference on Natural Language Processing and the 4th Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics",
month = dec,
year = "2025",
address = "Mumbai, India",
publisher = "The Asian Federation of Natural Language Processing and The Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-ijcnlp.113/",
pages = "1808--1827",
ISBN = "979-8-89176-303-6",
abstract = "Federated large language models (FedLLMs) enable cross-silo collaborative training among institutions while preserving data locality, making them appealing for privacy-sensitive domains such as law, finance, and healthcare. However, the memorization behavior of LLMs can lead to privacy risks that may cause cross-client data leakage. In this work, we study the threat of *cross-client data extraction*, where a semi-honest participant attempts to recover personally identifiable information (PII) memorized from other clients' data. We propose three simple yet effective extraction strategies that leverage contextual prefixes from the attacker{'}s local data, including frequency-based prefix sampling and local fine-tuning to amplify memorization. To evaluate these attacks, we construct a Chinese legal-domain dataset with fine-grained PII annotations consistent with CPIS, GDPR, and CCPA standards, and assess extraction performance using two metrics: *coverage* and *efficiency*. Experimental results show that our methods can recover up to 56.6{\%} of victim-exclusive PII, where names, addresses, and birthdays are particularly vulnerable. These findings highlight concrete privacy risks in FedLLMs and establish a benchmark and evaluation framework for future research on privacy-preserving federated learning. Code and data are available at https://github.com/SMILELab-FL/FedPII."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="hu-etal-2025-simple">
<titleInfo>
<title>Simple Yet Effective: Extracting Private Data Across Clients in Federated Fine-Tuning of Large Language Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yingqi</namePart>
<namePart type="family">Hu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhuo</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jingyuan</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jinghua</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Qifan</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lizhen</namePart>
<namePart type="family">Qu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zenglin</namePart>
<namePart type="family">Xu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 14th International Joint Conference on Natural Language Processing and the 4th Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kentaro</namePart>
<namePart type="family">Inui</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sakriani</namePart>
<namePart type="family">Sakti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Haofen</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Derek</namePart>
<namePart type="given">F</namePart>
<namePart type="family">Wong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pushpak</namePart>
<namePart type="family">Bhattacharyya</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Biplab</namePart>
<namePart type="family">Banerjee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Asif</namePart>
<namePart type="family">Ekbal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tanmoy</namePart>
<namePart type="family">Chakraborty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dhirendra</namePart>
<namePart type="given">Pratap</namePart>
<namePart type="family">Singh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>The Asian Federation of Natural Language Processing and The Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Mumbai, India</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-303-6</identifier>
</relatedItem>
<abstract>Federated large language models (FedLLMs) enable cross-silo collaborative training among institutions while preserving data locality, making them appealing for privacy-sensitive domains such as law, finance, and healthcare. However, the memorization behavior of LLMs can lead to privacy risks that may cause cross-client data leakage. In this work, we study the threat of *cross-client data extraction*, where a semi-honest participant attempts to recover personally identifiable information (PII) memorized from other clients’ data. We propose three simple yet effective extraction strategies that leverage contextual prefixes from the attacker’s local data, including frequency-based prefix sampling and local fine-tuning to amplify memorization. To evaluate these attacks, we construct a Chinese legal-domain dataset with fine-grained PII annotations consistent with CPIS, GDPR, and CCPA standards, and assess extraction performance using two metrics: *coverage* and *efficiency*. Experimental results show that our methods can recover up to 56.6% of victim-exclusive PII, where names, addresses, and birthdays are particularly vulnerable. These findings highlight concrete privacy risks in FedLLMs and establish a benchmark and evaluation framework for future research on privacy-preserving federated learning. Code and data are available at https://github.com/SMILELab-FL/FedPII.</abstract>
<identifier type="citekey">hu-etal-2025-simple</identifier>
<location>
<url>https://aclanthology.org/2025.findings-ijcnlp.113/</url>
</location>
<part>
<date>2025-12</date>
<extent unit="page">
<start>1808</start>
<end>1827</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Simple Yet Effective: Extracting Private Data Across Clients in Federated Fine-Tuning of Large Language Models
%A Hu, Yingqi
%A Zhang, Zhuo
%A Zhang, Jingyuan
%A Wang, Jinghua
%A Wang, Qifan
%A Qu, Lizhen
%A Xu, Zenglin
%Y Inui, Kentaro
%Y Sakti, Sakriani
%Y Wang, Haofen
%Y Wong, Derek F.
%Y Bhattacharyya, Pushpak
%Y Banerjee, Biplab
%Y Ekbal, Asif
%Y Chakraborty, Tanmoy
%Y Singh, Dhirendra Pratap
%S Proceedings of the 14th International Joint Conference on Natural Language Processing and the 4th Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics
%D 2025
%8 December
%I The Asian Federation of Natural Language Processing and The Association for Computational Linguistics
%C Mumbai, India
%@ 979-8-89176-303-6
%F hu-etal-2025-simple
%X Federated large language models (FedLLMs) enable cross-silo collaborative training among institutions while preserving data locality, making them appealing for privacy-sensitive domains such as law, finance, and healthcare. However, the memorization behavior of LLMs can lead to privacy risks that may cause cross-client data leakage. In this work, we study the threat of *cross-client data extraction*, where a semi-honest participant attempts to recover personally identifiable information (PII) memorized from other clients’ data. We propose three simple yet effective extraction strategies that leverage contextual prefixes from the attacker’s local data, including frequency-based prefix sampling and local fine-tuning to amplify memorization. To evaluate these attacks, we construct a Chinese legal-domain dataset with fine-grained PII annotations consistent with CPIS, GDPR, and CCPA standards, and assess extraction performance using two metrics: *coverage* and *efficiency*. Experimental results show that our methods can recover up to 56.6% of victim-exclusive PII, where names, addresses, and birthdays are particularly vulnerable. These findings highlight concrete privacy risks in FedLLMs and establish a benchmark and evaluation framework for future research on privacy-preserving federated learning. Code and data are available at https://github.com/SMILELab-FL/FedPII.
%U https://aclanthology.org/2025.findings-ijcnlp.113/
%P 1808-1827
Markdown (Informal)
[Simple Yet Effective: Extracting Private Data Across Clients in Federated Fine-Tuning of Large Language Models](https://aclanthology.org/2025.findings-ijcnlp.113/) (Hu et al., Findings 2025)
ACL
Yingqi Hu, Zhuo Zhang, Jingyuan Zhang, Jinghua Wang, Qifan Wang, Lizhen Qu, and Zenglin Xu. 2025. [Simple Yet Effective: Extracting Private Data Across Clients in Federated Fine-Tuning of Large Language Models](https://aclanthology.org/2025.findings-ijcnlp.113/). In Proceedings of the 14th International Joint Conference on Natural Language Processing and the 4th Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics, pages 1808–1827, Mumbai, India. The Asian Federation of Natural Language Processing and The Association for Computational Linguistics.
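
The abstract names two evaluation metrics, *coverage* and *efficiency*, and a frequency-based prefix-sampling strategy. As a rough illustration only, here is a minimal Python sketch of how such metrics and sampling *could* be defined. The function names, set-based PII matching, and whitespace tokenization are all assumptions of this sketch, not the paper's method; the actual definitions live in the linked PDF and the FedPII repository, and the Chinese legal-domain dataset would require a proper tokenizer.

```python
from collections import Counter

def coverage(extracted: set[str], victim_pii: set[str]) -> float:
    # Hypothetical metric: fraction of the victim-exclusive PII set
    # that the attacker's generations managed to recover.
    return len(extracted & victim_pii) / max(len(victim_pii), 1)

def efficiency(extracted: set[str], victim_pii: set[str],
               num_generations: int) -> float:
    # Hypothetical metric: correctly recovered PII items per generation,
    # i.e. how cheaply the attack extracts private data.
    return len(extracted & victim_pii) / max(num_generations, 1)

def sample_prefixes(local_texts: list[str],
                    prefix_len: int = 8, k: int = 100) -> list[str]:
    # Frequency-based prefix sampling (sketch): rank the most common
    # prefix_len-token prefixes in the attacker's own local data and
    # reuse them as prompts to elicit memorized cross-client content.
    # Whitespace tokenization is a placeholder assumption.
    counts = Counter(tuple(text.split()[:prefix_len]) for text in local_texts)
    return [" ".join(prefix) for prefix, _ in counts.most_common(k)]
```

Under these assumed definitions, the abstract's headline figure of 56.6% would correspond to a coverage of about 0.566 on victim-exclusive PII.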