% Conference paper in the NAACL 2025 Student Research Workshop proceedings (pp. 200--206).
% NOTE(review): the editor list carries both "Haider, Samar" and "Haider, Sammar";
% confirm against the proceedings front matter whether these are two people or a duplicate.
@inproceedings{belkadi-etal-2025-generating,
    title = "Generating Synthetic Free-text Medical Records with Low Re-identification Risk using Masked Language Modeling",
    author = "Belkadi, Samuel and
      Ren, Libo and
      Micheletti, Nicolo and
      Han, Lifeng and
      Nenadic, Goran",
    editor = "Ebrahimi, Abteen and
      Haider, Samar and
      Liu, Emmy and
      Haider, Sammar and
      Leonor Pacheco, Maria and
      Wein, Shira",
    booktitle = "Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 4: Student Research Workshop)",
    month = apr,
    year = "2025",
    address = "Albuquerque, USA",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2025.naacl-srw.20/",
    doi = "10.18653/v1/2025.naacl-srw.20",
    pages = "200--206",
    isbn = "979-8-89176-192-6",
    abstract = "The abundance of medical records holds great promise for enhancing healthcare and advancing biomedical research. However, due to \textit{privacy} constraints, access to such data is typically limited to internal use. Recent studies have attempted to overcome this challenge by generating synthetic data through Causal Language Modelling. Yet, this approach often fails to ensure patient anonymity and offers limited control over output diversity{---}unless additional computational cost is introduced. In response, we propose a method for generating synthetic free-text medical records based on \textit{Masked Language Modelling}. Our approach retains key medical details while introducing variability in the generated texts and reducing the risk of patient re-identification. With a relatively lightweight architecture of approximately 120 million parameters, the system ensures low inference costs. Experimental results show that our method produces high-quality synthetic data, achieving a HIPAA-compliant PHI recall of 96{\%} and a re-identification risk of only 3.5{\%}. Furthermore, downstream evaluations reveal that models trained on the synthetic data perform comparably to those trained on real-world data. Our trained models are publicly available on Github as SynDeidMLM (at \url{https://github.com/SamySam0/SynDeidMLM}) (meaning \textbf{syn}thetic and \textbf{de-id}entified data generation using \textbf{MLM})."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="belkadi-etal-2025-generating">
<titleInfo>
<title>Generating Synthetic Free-text Medical Records with Low Re-identification Risk using Masked Language Modeling</title>
</titleInfo>
<name type="personal">
<namePart type="given">Samuel</namePart>
<namePart type="family">Belkadi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Libo</namePart>
<namePart type="family">Ren</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nicolo</namePart>
<namePart type="family">Micheletti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lifeng</namePart>
<namePart type="family">Han</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Goran</namePart>
<namePart type="family">Nenadic</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-04</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 4: Student Research Workshop)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Abteen</namePart>
<namePart type="family">Ebrahimi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Samar</namePart>
<namePart type="family">Haider</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Emmy</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sammar</namePart>
<namePart type="family">Haider</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Leonor Pacheco</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shira</namePart>
<namePart type="family">Wein</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Albuquerque, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-192-6</identifier>
</relatedItem>
<abstract>The abundance of medical records holds great promise for enhancing healthcare and advancing biomedical research. However, due to privacy constraints, access to such data is typically limited to internal use. Recent studies have attempted to overcome this challenge by generating synthetic data through Causal Language Modelling. Yet, this approach often fails to ensure patient anonymity and offers limited control over output diversity—unless additional computational cost is introduced. In response, we propose a method for generating synthetic free-text medical records based on Masked Language Modelling. Our approach retains key medical details while introducing variability in the generated texts and reducing the risk of patient re-identification. With a relatively lightweight architecture of approximately 120 million parameters, the system ensures low inference costs. Experimental results show that our method produces high-quality synthetic data, achieving a HIPAA-compliant PHI recall of 96% and a re-identification risk of only 3.5%. Furthermore, downstream evaluations reveal that models trained on the synthetic data perform comparably to those trained on real-world data. Our trained models are publicly available on Github as SynDeidMLM (at https://github.com/SamySam0/SynDeidMLM) (meaning synthetic and de-identified data generation using MLM).</abstract>
<identifier type="citekey">belkadi-etal-2025-generating</identifier>
<identifier type="doi">10.18653/v1/2025.naacl-srw.20</identifier>
<location>
<url>https://aclanthology.org/2025.naacl-srw.20/</url>
</location>
<part>
<date>2025-04</date>
<extent unit="page">
<start>200</start>
<end>206</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Generating Synthetic Free-text Medical Records with Low Re-identification Risk using Masked Language Modeling
%A Belkadi, Samuel
%A Ren, Libo
%A Micheletti, Nicolo
%A Han, Lifeng
%A Nenadic, Goran
%Y Ebrahimi, Abteen
%Y Haider, Samar
%Y Liu, Emmy
%Y Haider, Sammar
%Y Leonor Pacheco, Maria
%Y Wein, Shira
%S Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 4: Student Research Workshop)
%D 2025
%8 April
%I Association for Computational Linguistics
%C Albuquerque, USA
%@ 979-8-89176-192-6
%F belkadi-etal-2025-generating
%X The abundance of medical records holds great promise for enhancing healthcare and advancing biomedical research. However, due to privacy constraints, access to such data is typically limited to internal use. Recent studies have attempted to overcome this challenge by generating synthetic data through Causal Language Modelling. Yet, this approach often fails to ensure patient anonymity and offers limited control over output diversity—unless additional computational cost is introduced. In response, we propose a method for generating synthetic free-text medical records based on Masked Language Modelling. Our approach retains key medical details while introducing variability in the generated texts and reducing the risk of patient re-identification. With a relatively lightweight architecture of approximately 120 million parameters, the system ensures low inference costs. Experimental results show that our method produces high-quality synthetic data, achieving a HIPAA-compliant PHI recall of 96% and a re-identification risk of only 3.5%. Furthermore, downstream evaluations reveal that models trained on the synthetic data perform comparably to those trained on real-world data. Our trained models are publicly available on Github as SynDeidMLM (at https://github.com/SamySam0/SynDeidMLM) (meaning synthetic and de-identified data generation using MLM).
%R 10.18653/v1/2025.naacl-srw.20
%U https://aclanthology.org/2025.naacl-srw.20/
%U https://doi.org/10.18653/v1/2025.naacl-srw.20
%P 200-206
Markdown (Informal)
[Generating Synthetic Free-text Medical Records with Low Re-identification Risk using Masked Language Modeling](https://aclanthology.org/2025.naacl-srw.20/) (Belkadi et al., NAACL 2025)
ACL