@inproceedings{eder-etal-2022-beste,
title = {{``}Beste Gr{\"u}{\ss}e, Maria Meyer{''} {---} Pseudonymization of Privacy-Sensitive Information in Emails},
author = "Eder, Elisabeth and
Wiegand, Michael and
Krieg-Holz, Ulrike and
Hahn, Udo",
editor = "Calzolari, Nicoletta and
B{\'e}chet, Fr{\'e}d{\'e}ric and
Blache, Philippe and
Choukri, Khalid and
Cieri, Christopher and
Declerck, Thierry and
Goggi, Sara and
Isahara, Hitoshi and
Maegaard, Bente and
Mariani, Joseph and
Mazo, H{\'e}l{\`e}ne and
Odijk, Jan and
Piperidis, Stelios",
booktitle = "Proceedings of the Thirteenth Language Resources and Evaluation Conference",
month = jun,
year = "2022",
address = "Marseille, France",
publisher = "European Language Resources Association",
url = "https://aclanthology.org/2022.lrec-1.79",
pages = "741--752",
abstract = "The exploding amount of user-generated content has spurred NLP research to deal with documents from various digital communication formats (tweets, chats, emails, etc.). Using these texts as language resources implies complying with legal data privacy regulations. To protect the personal data of individuals and preclude their identification, we employ pseudonymization. More precisely, we identify those text spans that carry information revealing an individual{'}s identity (e.g., names of persons, locations, phone numbers, or dates) and subsequently substitute them with synthetically generated surrogates. Based on CodE Alltag, a German-language email corpus, we address two tasks. The first task is to evaluate various architectures for the automatic recognition of privacy-sensitive entities in raw data. The second task examines the applicability of pseudonymized data as training data for such systems since models learned on original data cannot be published for reasons of privacy protection. As outputs of both tasks, we, first, generate a new pseudonymized version of CodE Alltag compliant with the legal requirements of the General Data Protection Regulation (GDPR). Second, we make accessible a tagger for recognizing privacy-sensitive information in German emails and similar text genres, which is trained on already pseudonymized data.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="eder-etal-2022-beste">
<titleInfo>
<title>“Beste Grüße, Maria Meyer” — Pseudonymization of Privacy-Sensitive Information in Emails</title>
</titleInfo>
<name type="personal">
<namePart type="given">Elisabeth</namePart>
<namePart type="family">Eder</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Michael</namePart>
<namePart type="family">Wiegand</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ulrike</namePart>
<namePart type="family">Krieg-Holz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Udo</namePart>
<namePart type="family">Hahn</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Thirteenth Language Resources and Evaluation Conference</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Frédéric</namePart>
<namePart type="family">Béchet</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Philippe</namePart>
<namePart type="family">Blache</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Khalid</namePart>
<namePart type="family">Choukri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Christopher</namePart>
<namePart type="family">Cieri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Thierry</namePart>
<namePart type="family">Declerck</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sara</namePart>
<namePart type="family">Goggi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hitoshi</namePart>
<namePart type="family">Isahara</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bente</namePart>
<namePart type="family">Maegaard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joseph</namePart>
<namePart type="family">Mariani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hélène</namePart>
<namePart type="family">Mazo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jan</namePart>
<namePart type="family">Odijk</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stelios</namePart>
<namePart type="family">Piperidis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>European Language Resources Association</publisher>
<place>
<placeTerm type="text">Marseille, France</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>The exploding amount of user-generated content has spurred NLP research to deal with documents from various digital communication formats (tweets, chats, emails, etc.). Using these texts as language resources implies complying with legal data privacy regulations. To protect the personal data of individuals and preclude their identification, we employ pseudonymization. More precisely, we identify those text spans that carry information revealing an individual’s identity (e.g., names of persons, locations, phone numbers, or dates) and subsequently substitute them with synthetically generated surrogates. Based on CodE Alltag, a German-language email corpus, we address two tasks. The first task is to evaluate various architectures for the automatic recognition of privacy-sensitive entities in raw data. The second task examines the applicability of pseudonymized data as training data for such systems since models learned on original data cannot be published for reasons of privacy protection. As outputs of both tasks, we, first, generate a new pseudonymized version of CodE Alltag compliant with the legal requirements of the General Data Protection Regulation (GDPR). Second, we make accessible a tagger for recognizing privacy-sensitive information in German emails and similar text genres, which is trained on already pseudonymized data.</abstract>
<identifier type="citekey">eder-etal-2022-beste</identifier>
<location>
<url>https://aclanthology.org/2022.lrec-1.79</url>
</location>
<part>
<date>2022-06</date>
<extent unit="page">
<start>741</start>
<end>752</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T “Beste Grüße, Maria Meyer” — Pseudonymization of Privacy-Sensitive Information in Emails
%A Eder, Elisabeth
%A Wiegand, Michael
%A Krieg-Holz, Ulrike
%A Hahn, Udo
%Y Calzolari, Nicoletta
%Y Béchet, Frédéric
%Y Blache, Philippe
%Y Choukri, Khalid
%Y Cieri, Christopher
%Y Declerck, Thierry
%Y Goggi, Sara
%Y Isahara, Hitoshi
%Y Maegaard, Bente
%Y Mariani, Joseph
%Y Mazo, Hélène
%Y Odijk, Jan
%Y Piperidis, Stelios
%S Proceedings of the Thirteenth Language Resources and Evaluation Conference
%D 2022
%8 June
%I European Language Resources Association
%C Marseille, France
%F eder-etal-2022-beste
%X The exploding amount of user-generated content has spurred NLP research to deal with documents from various digital communication formats (tweets, chats, emails, etc.). Using these texts as language resources implies complying with legal data privacy regulations. To protect the personal data of individuals and preclude their identification, we employ pseudonymization. More precisely, we identify those text spans that carry information revealing an individual’s identity (e.g., names of persons, locations, phone numbers, or dates) and subsequently substitute them with synthetically generated surrogates. Based on CodE Alltag, a German-language email corpus, we address two tasks. The first task is to evaluate various architectures for the automatic recognition of privacy-sensitive entities in raw data. The second task examines the applicability of pseudonymized data as training data for such systems since models learned on original data cannot be published for reasons of privacy protection. As outputs of both tasks, we, first, generate a new pseudonymized version of CodE Alltag compliant with the legal requirements of the General Data Protection Regulation (GDPR). Second, we make accessible a tagger for recognizing privacy-sensitive information in German emails and similar text genres, which is trained on already pseudonymized data.
%U https://aclanthology.org/2022.lrec-1.79
%P 741-752
Markdown (Informal)
[“Beste Grüße, Maria Meyer” — Pseudonymization of Privacy-Sensitive Information in Emails](https://aclanthology.org/2022.lrec-1.79) (Eder et al., LREC 2022)
ACL