@inproceedings{sun-etal-2025-dpga,
title = "{DPGA}-{T}ext{S}yn: Differentially Private Genetic Algorithm for Synthetic Text Generation",
author = "Sun, Zhonghao and
Tian, Zhiliang and
Song, Yiping and
Si, Yuyi and
Zhang, Juhua and
Huang, Minlie and
Lu, Kai and
Xiong, Zeyu and
Liu, Xinwang and
Li, Dongsheng",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2025",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-acl.831/",
doi = "10.18653/v1/2025.findings-acl.831",
pages = "16159--16179",
ISBN = "979-8-89176-256-5",
abstract = "Using large language models (LLMs) has a potential risk of privacy leakage since the data with sensitive information may be used for fine-tuning the LLMs. Differential privacy (DP) provides theoretical guarantees of privacy protection, but its practical application in LLMs still has the problem of privacy-utility trade-off. Researchers synthesized data with strong generation capabilities closed-source LLMs (i.e., GPT-4) under DP to alleviate this problem, but this method is not so flexible in fitting the given privacy distributions without fine-tuning. Besides, such methods can hardly balance the diversity of synthetic data and its relevance to target privacy data without accessing so much private data. To this end, this paper proposes DPGA-TextSyn, combining general LLMs with genetic algorithm (GA) to produce relevant and diverse synthetic text under DP constraints. First, we integrate the privacy gene (i.e., metadata) to generate better initial samples. Then, to achieve survival of the fittest and avoid homogeneity, we use privacy nearest neighbor voting and similarity suppression to select elite samples. In addition, we expand elite samples via genetic strategies such as mutation, crossover, and generation to expand the search scope of GA. Experiments show that this method significantly improves the performance of the model in downstream tasks while ensuring privacy."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="sun-etal-2025-dpga">
<titleInfo>
<title>DPGA-TextSyn: Differentially Private Genetic Algorithm for Synthetic Text Generation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Zhonghao</namePart>
<namePart type="family">Sun</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhiliang</namePart>
<namePart type="family">Tian</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yiping</namePart>
<namePart type="family">Song</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yuyi</namePart>
<namePart type="family">Si</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Juhua</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Minlie</namePart>
<namePart type="family">Huang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kai</namePart>
<namePart type="family">Lu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zeyu</namePart>
<namePart type="family">Xiong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xinwang</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dongsheng</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2025</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wanxiang</namePart>
<namePart type="family">Che</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joyce</namePart>
<namePart type="family">Nabende</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Shutova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohammad</namePart>
<namePart type="given">Taher</namePart>
<namePart type="family">Pilehvar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-256-5</identifier>
</relatedItem>
<abstract>Using large language models (LLMs) has a potential risk of privacy leakage since the data with sensitive information may be used for fine-tuning the LLMs. Differential privacy (DP) provides theoretical guarantees of privacy protection, but its practical application in LLMs still has the problem of privacy-utility trade-off. Researchers synthesized data with strong generation capabilities closed-source LLMs (i.e., GPT-4) under DP to alleviate this problem, but this method is not so flexible in fitting the given privacy distributions without fine-tuning. Besides, such methods can hardly balance the diversity of synthetic data and its relevance to target privacy data without accessing so much private data. To this end, this paper proposes DPGA-TextSyn, combining general LLMs with genetic algorithm (GA) to produce relevant and diverse synthetic text under DP constraints. First, we integrate the privacy gene (i.e., metadata) to generate better initial samples. Then, to achieve survival of the fittest and avoid homogeneity, we use privacy nearest neighbor voting and similarity suppression to select elite samples. In addition, we expand elite samples via genetic strategies such as mutation, crossover, and generation to expand the search scope of GA. Experiments show that this method significantly improves the performance of the model in downstream tasks while ensuring privacy.</abstract>
<identifier type="citekey">sun-etal-2025-dpga</identifier>
<identifier type="doi">10.18653/v1/2025.findings-acl.831</identifier>
<location>
<url>https://aclanthology.org/2025.findings-acl.831/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>16159</start>
<end>16179</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T DPGA-TextSyn: Differentially Private Genetic Algorithm for Synthetic Text Generation
%A Sun, Zhonghao
%A Tian, Zhiliang
%A Song, Yiping
%A Si, Yuyi
%A Zhang, Juhua
%A Huang, Minlie
%A Lu, Kai
%A Xiong, Zeyu
%A Liu, Xinwang
%A Li, Dongsheng
%Y Che, Wanxiang
%Y Nabende, Joyce
%Y Shutova, Ekaterina
%Y Pilehvar, Mohammad Taher
%S Findings of the Association for Computational Linguistics: ACL 2025
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-256-5
%F sun-etal-2025-dpga
%X Using large language models (LLMs) has a potential risk of privacy leakage since the data with sensitive information may be used for fine-tuning the LLMs. Differential privacy (DP) provides theoretical guarantees of privacy protection, but its practical application in LLMs still has the problem of privacy-utility trade-off. Researchers synthesized data with strong generation capabilities closed-source LLMs (i.e., GPT-4) under DP to alleviate this problem, but this method is not so flexible in fitting the given privacy distributions without fine-tuning. Besides, such methods can hardly balance the diversity of synthetic data and its relevance to target privacy data without accessing so much private data. To this end, this paper proposes DPGA-TextSyn, combining general LLMs with genetic algorithm (GA) to produce relevant and diverse synthetic text under DP constraints. First, we integrate the privacy gene (i.e., metadata) to generate better initial samples. Then, to achieve survival of the fittest and avoid homogeneity, we use privacy nearest neighbor voting and similarity suppression to select elite samples. In addition, we expand elite samples via genetic strategies such as mutation, crossover, and generation to expand the search scope of GA. Experiments show that this method significantly improves the performance of the model in downstream tasks while ensuring privacy.
%R 10.18653/v1/2025.findings-acl.831
%U https://aclanthology.org/2025.findings-acl.831/
%U https://doi.org/10.18653/v1/2025.findings-acl.831
%P 16159-16179
Markdown (Informal)
[DPGA-TextSyn: Differentially Private Genetic Algorithm for Synthetic Text Generation](https://aclanthology.org/2025.findings-acl.831/) (Sun et al., Findings 2025)
ACL
- Zhonghao Sun, Zhiliang Tian, Yiping Song, Yuyi Si, Juhua Zhang, Minlie Huang, Kai Lu, Zeyu Xiong, Xinwang Liu, and Dongsheng Li. 2025. DPGA-TextSyn: Differentially Private Genetic Algorithm for Synthetic Text Generation. In Findings of the Association for Computational Linguistics: ACL 2025, pages 16159–16179, Vienna, Austria. Association for Computational Linguistics.