@inproceedings{abdjul-etal-2025-indonesian,
title = "{I}ndonesian Speech Content De-Identification in Low Resource Transcripts",
author = "Abdjul, Rifqi Naufal and
Puji Lestari, Dessi and
Purwarianti, Ayu and
Mawalim, Candy Olivia and
Sakti, Sakriani and
Unoki, Masashi",
editor = "Wijaya, Derry and
Aji, Alham Fikri and
Vania, Clara and
Winata, Genta Indra and
Purwarianti, Ayu",
booktitle = "Proceedings of the Second Workshop in South East Asian Language Processing",
month = jan,
year = "2025",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.sealp-1.6/",
pages = "63--71",
abstract = "Advancements in technology and the increased use of digital data threaten individual privacy, especially in speech containing Personally Identifiable Information (PII). Therefore, systems that can remove or process privacy-sensitive data in speech are needed, particularly for low-resource transcripts. These transcripts are minimally annotated or labeled automatically, which is less precise than human annotation. However, using them can simplify the development of de-identification systems in any language. In this study, we develop and evaluate an efficient speech de-identification system. We create an Indonesian speech dataset containing sensitive private information and design a system with three main components: speech recognition, information extraction, and masking. To enhance performance in low-resource settings, we incorporate transcription data in training, use data augmentation, and apply weakly supervised learning. Our results show that our techniques significantly improve privacy detection performance, with approximately 29{\%} increase in F1 score, 20{\%} in precision, and 30{\%} in recall with minimally labeled data."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="abdjul-etal-2025-indonesian">
<titleInfo>
<title>Indonesian Speech Content De-Identification in Low Resource Transcripts</title>
</titleInfo>
<name type="personal">
<namePart type="given">Rifqi</namePart>
<namePart type="given">Naufal</namePart>
<namePart type="family">Abdjul</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dessi</namePart>
<namePart type="family">Puji Lestari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ayu</namePart>
<namePart type="family">Purwarianti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Candy</namePart>
<namePart type="given">Olivia</namePart>
<namePart type="family">Mawalim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sakriani</namePart>
<namePart type="family">Sakti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Masashi</namePart>
<namePart type="family">Unoki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-01</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Second Workshop in South East Asian Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Derry</namePart>
<namePart type="family">Wijaya</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alham</namePart>
<namePart type="given">Fikri</namePart>
<namePart type="family">Aji</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Clara</namePart>
<namePart type="family">Vania</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Genta</namePart>
<namePart type="given">Indra</namePart>
<namePart type="family">Winata</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ayu</namePart>
<namePart type="family">Purwarianti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Advancements in technology and the increased use of digital data threaten individual privacy, especially in speech containing Personally Identifiable Information (PII). Therefore, systems that can remove or process privacy-sensitive data in speech are needed, particularly for low-resource transcripts. These transcripts are minimally annotated or labeled automatically, which is less precise than human annotation. However, using them can simplify the development of de-identification systems in any language. In this study, we develop and evaluate an efficient speech de-identification system. We create an Indonesian speech dataset containing sensitive private information and design a system with three main components: speech recognition, information extraction, and masking. To enhance performance in low-resource settings, we incorporate transcription data in training, use data augmentation, and apply weakly supervised learning. Our results show that our techniques significantly improve privacy detection performance, with approximately 29% increase in F1 score, 20% in precision, and 30% in recall with minimally labeled data.</abstract>
<identifier type="citekey">abdjul-etal-2025-indonesian</identifier>
<location>
<url>https://aclanthology.org/2025.sealp-1.6/</url>
</location>
<part>
<date>2025-01</date>
<extent unit="page">
<start>63</start>
<end>71</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Indonesian Speech Content De-Identification in Low Resource Transcripts
%A Abdjul, Rifqi Naufal
%A Puji Lestari, Dessi
%A Purwarianti, Ayu
%A Mawalim, Candy Olivia
%A Sakti, Sakriani
%A Unoki, Masashi
%Y Wijaya, Derry
%Y Aji, Alham Fikri
%Y Vania, Clara
%Y Winata, Genta Indra
%Y Purwarianti, Ayu
%S Proceedings of the Second Workshop in South East Asian Language Processing
%D 2025
%8 January
%I Association for Computational Linguistics
%C Online
%F abdjul-etal-2025-indonesian
%X Advancements in technology and the increased use of digital data threaten individual privacy, especially in speech containing Personally Identifiable Information (PII). Therefore, systems that can remove or process privacy-sensitive data in speech are needed, particularly for low-resource transcripts. These transcripts are minimally annotated or labeled automatically, which is less precise than human annotation. However, using them can simplify the development of de-identification systems in any language. In this study, we develop and evaluate an efficient speech de-identification system. We create an Indonesian speech dataset containing sensitive private information and design a system with three main components: speech recognition, information extraction, and masking. To enhance performance in low-resource settings, we incorporate transcription data in training, use data augmentation, and apply weakly supervised learning. Our results show that our techniques significantly improve privacy detection performance, with approximately 29% increase in F1 score, 20% in precision, and 30% in recall with minimally labeled data.
%U https://aclanthology.org/2025.sealp-1.6/
%P 63-71
Markdown (Informal)
[Indonesian Speech Content De-Identification in Low Resource Transcripts](https://aclanthology.org/2025.sealp-1.6/) (Abdjul et al., sealp 2025)
ACL