@inproceedings{mouhammad-etal-2023-crowdsourcing,
title = "Crowdsourcing on Sensitive Data with Privacy-Preserving Text Rewriting",
author = "Mouhammad, Nina and
Daxenberger, Johannes and
Schiller, Benjamin and
Habernal, Ivan",
editor = "Prange, Jakob and
Friedrich, Annemarie",
booktitle = "Proceedings of the 17th Linguistic Annotation Workshop (LAW-XVII)",
month = jul,
year = "2023",
address = "Toronto, Canada",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.law-1.8",
doi = "10.18653/v1/2023.law-1.8",
pages = "73--84",
abstract = "Most tasks in NLP require labeled data. Data labeling is often done on crowdsourcing platforms due to scalability reasons. However, publishing data on public platforms can only be done if no privacy-relevant information is included. Textual data often contains sensitive information like person names or locations. In this work, we investigate how removing personally identifiable information (PII) as well as applying differential privacy (DP) rewriting can enable text with privacy-relevant information to be used for crowdsourcing. We find that DP-rewriting before crowdsourcing can preserve privacy while still leading to good label quality for certain tasks and data. PII-removal led to good label quality in all examined tasks, however, there are no privacy guarantees given.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="mouhammad-etal-2023-crowdsourcing">
<titleInfo>
<title>Crowdsourcing on Sensitive Data with Privacy-Preserving Text Rewriting</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nina</namePart>
<namePart type="family">Mouhammad</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Johannes</namePart>
<namePart type="family">Daxenberger</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Benjamin</namePart>
<namePart type="family">Schiller</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ivan</namePart>
<namePart type="family">Habernal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 17th Linguistic Annotation Workshop (LAW-XVII)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jakob</namePart>
<namePart type="family">Prange</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Annemarie</namePart>
<namePart type="family">Friedrich</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Toronto, Canada</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Most tasks in NLP require labeled data. Data labeling is often done on crowdsourcing platforms due to scalability reasons. However, publishing data on public platforms can only be done if no privacy-relevant information is included. Textual data often contains sensitive information like person names or locations. In this work, we investigate how removing personally identifiable information (PII) as well as applying differential privacy (DP) rewriting can enable text with privacy-relevant information to be used for crowdsourcing. We find that DP-rewriting before crowdsourcing can preserve privacy while still leading to good label quality for certain tasks and data. PII-removal led to good label quality in all examined tasks, however, there are no privacy guarantees given.</abstract>
<identifier type="citekey">mouhammad-etal-2023-crowdsourcing</identifier>
<identifier type="doi">10.18653/v1/2023.law-1.8</identifier>
<location>
<url>https://aclanthology.org/2023.law-1.8</url>
</location>
<part>
<date>2023-07</date>
<extent unit="page">
<start>73</start>
<end>84</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Crowdsourcing on Sensitive Data with Privacy-Preserving Text Rewriting
%A Mouhammad, Nina
%A Daxenberger, Johannes
%A Schiller, Benjamin
%A Habernal, Ivan
%Y Prange, Jakob
%Y Friedrich, Annemarie
%S Proceedings of the 17th Linguistic Annotation Workshop (LAW-XVII)
%D 2023
%8 July
%I Association for Computational Linguistics
%C Toronto, Canada
%F mouhammad-etal-2023-crowdsourcing
%X Most tasks in NLP require labeled data. Data labeling is often done on crowdsourcing platforms due to scalability reasons. However, publishing data on public platforms can only be done if no privacy-relevant information is included. Textual data often contains sensitive information like person names or locations. In this work, we investigate how removing personally identifiable information (PII) as well as applying differential privacy (DP) rewriting can enable text with privacy-relevant information to be used for crowdsourcing. We find that DP-rewriting before crowdsourcing can preserve privacy while still leading to good label quality for certain tasks and data. PII-removal led to good label quality in all examined tasks, however, there are no privacy guarantees given.
%R 10.18653/v1/2023.law-1.8
%U https://aclanthology.org/2023.law-1.8
%U https://doi.org/10.18653/v1/2023.law-1.8
%P 73-84
Markdown (Informal)
[Crowdsourcing on Sensitive Data with Privacy-Preserving Text Rewriting](https://aclanthology.org/2023.law-1.8) (Mouhammad et al., LAW 2023)
ACL