@inproceedings{szawerna-etal-2024-detecting,
title = "Detecting Personal Identifiable Information in {S}wedish Learner Essays",
author = {Szawerna, Maria Irena and
Dobnik, Simon and
Mu{\~n}oz S{\'a}nchez, Ricardo and
Lindstr{\"o}m Tiedemann, Therese and
Volodina, Elena},
editor = {Volodina, Elena and
Alfter, David and
Dobnik, Simon and
Lindstr{\"o}m Tiedemann, Therese and
Mu{\~n}oz S{\'a}nchez, Ricardo and
Szawerna, Maria Irena and
Vu, Xuan-Son},
booktitle = "Proceedings of the Workshop on Computational Approaches to Language Data Pseudonymization (CALD-pseudo 2024)",
month = mar,
year = "2024",
address = "St. Julian{'}s, Malta",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.caldpseudo-1.7/",
pages = "54--63",
abstract = "Linguistic data can {---} and often does {---} contain PII (Personal Identifiable Information). Both from a legal and ethical standpoint, the sharing of such data is not permissible. According to the GDPR, pseudonymization, i.e. the replacement of sensitive information with surrogates, is an acceptable strategy for privacy preservation. While research has been conducted on the detection and replacement of sensitive data in Swedish medical data using Large Language Models (LLMs), it is unclear whether these models handle PII in less structured and more thematically varied texts equally well. In this paper, we present and discuss the performance of an LLM-based PII-detection system for Swedish learner essays."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="szawerna-etal-2024-detecting">
<titleInfo>
<title>Detecting Personal Identifiable Information in Swedish Learner Essays</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="given">Irena</namePart>
<namePart type="family">Szawerna</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Simon</namePart>
<namePart type="family">Dobnik</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ricardo</namePart>
<namePart type="family">Muñoz Sánchez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Therese</namePart>
<namePart type="family">Lindström Tiedemann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Elena</namePart>
<namePart type="family">Volodina</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Workshop on Computational Approaches to Language Data Pseudonymization (CALD-pseudo 2024)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Elena</namePart>
<namePart type="family">Volodina</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Alfter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Simon</namePart>
<namePart type="family">Dobnik</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Therese</namePart>
<namePart type="family">Lindström Tiedemann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ricardo</namePart>
<namePart type="family">Muñoz Sánchez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="given">Irena</namePart>
<namePart type="family">Szawerna</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xuan-Son</namePart>
<namePart type="family">Vu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">St. Julian’s, Malta</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Linguistic data can — and often does — contain PII (Personal Identifiable Information). Both from a legal and ethical standpoint, the sharing of such data is not permissible. According to the GDPR, pseudonymization, i.e. the replacement of sensitive information with surrogates, is an acceptable strategy for privacy preservation. While research has been conducted on the detection and replacement of sensitive data in Swedish medical data using Large Language Models (LLMs), it is unclear whether these models handle PII in less structured and more thematically varied texts equally well. In this paper, we present and discuss the performance of an LLM-based PII-detection system for Swedish learner essays.</abstract>
<identifier type="citekey">szawerna-etal-2024-detecting</identifier>
<location>
<url>https://aclanthology.org/2024.caldpseudo-1.7/</url>
</location>
<part>
<date>2024-03</date>
<extent unit="page">
<start>54</start>
<end>63</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Detecting Personal Identifiable Information in Swedish Learner Essays
%A Szawerna, Maria Irena
%A Dobnik, Simon
%A Muñoz Sánchez, Ricardo
%A Lindström Tiedemann, Therese
%A Volodina, Elena
%Y Volodina, Elena
%Y Alfter, David
%Y Dobnik, Simon
%Y Lindström Tiedemann, Therese
%Y Muñoz Sánchez, Ricardo
%Y Szawerna, Maria Irena
%Y Vu, Xuan-Son
%S Proceedings of the Workshop on Computational Approaches to Language Data Pseudonymization (CALD-pseudo 2024)
%D 2024
%8 March
%I Association for Computational Linguistics
%C St. Julian’s, Malta
%F szawerna-etal-2024-detecting
%X Linguistic data can — and often does — contain PII (Personal Identifiable Information). Both from a legal and ethical standpoint, the sharing of such data is not permissible. According to the GDPR, pseudonymization, i.e. the replacement of sensitive information with surrogates, is an acceptable strategy for privacy preservation. While research has been conducted on the detection and replacement of sensitive data in Swedish medical data using Large Language Models (LLMs), it is unclear whether these models handle PII in less structured and more thematically varied texts equally well. In this paper, we present and discuss the performance of an LLM-based PII-detection system for Swedish learner essays.
%U https://aclanthology.org/2024.caldpseudo-1.7/
%P 54-63
Markdown (Informal)
[Detecting Personal Identifiable Information in Swedish Learner Essays](https://aclanthology.org/2024.caldpseudo-1.7/) (Szawerna et al., CALD-pseudo 2024)
ACL
- Maria Irena Szawerna, Simon Dobnik, Ricardo Muñoz Sánchez, Therese Lindström Tiedemann, and Elena Volodina. 2024. Detecting Personal Identifiable Information in Swedish Learner Essays. In Proceedings of the Workshop on Computational Approaches to Language Data Pseudonymization (CALD-pseudo 2024), pages 54–63, St. Julian’s, Malta. Association for Computational Linguistics.