@inproceedings{ponomareva-etal-2022-training-text,
    title = "Training Text-to-Text Transformers with Privacy Guarantees",
    author = "Ponomareva, Natalia and
      Bastings, Jasmijn and
      Vassilvitskii, Sergei",
    editor = "Feyisetan, Oluwaseyi and
      Ghanavati, Sepideh and
      Thaine, Patricia and
      Habernal, Ivan and
      Mireshghallah, Fatemehsadat",
    booktitle = "Proceedings of the Fourth Workshop on Privacy in Natural Language Processing",
    month = jul,
    year = "2022",
    address = "Seattle, United States",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2022.privatenlp-1.4/",
    doi = "10.18653/v1/2022.privatenlp-1.4",
    pages = "21--21",
    abstract = "Recent advances in NLP often stem from large transformer-based pre-trained models, which rapidly grow in size and use more and more training data. Such models are often released to the public so that end users can fine-tune them on a task dataset. While it is common to treat pre-training data as public, it may still contain personally identifiable information (PII), such as names, phone numbers, and copyrighted material. Recent findings show that the capacity of these models allows them to memorize parts of the training data, and suggest differentially private (DP) training as a potential mitigation. While there is recent work on DP fine-tuning of NLP models, the effects of DP pre-training are less well understood: it is not clear how downstream performance is affected by DP pre-training, and whether DP pre-training mitigates some of the memorization concerns. We focus on T5 and show that by using recent advances in JAX and XLA we can train models with DP that do not suffer a large drop in pre-training utility, nor in training speed, and can still be fine-tuned to high accuracy on downstream tasks (e.g. GLUE). Moreover, we show that T5{'}s span corruption is a good defense against data memorization."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="ponomareva-etal-2022-training-text">
<titleInfo>
<title>Training Text-to-Text Transformers with Privacy Guarantees</title>
</titleInfo>
<name type="personal">
<namePart type="given">Natalia</namePart>
<namePart type="family">Ponomareva</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jasmijn</namePart>
<namePart type="family">Bastings</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sergei</namePart>
<namePart type="family">Vassilvitskii</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Fourth Workshop on Privacy in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Oluwaseyi</namePart>
<namePart type="family">Feyisetan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sepideh</namePart>
<namePart type="family">Ghanavati</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Patricia</namePart>
<namePart type="family">Thaine</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ivan</namePart>
<namePart type="family">Habernal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Fatemehsadat</namePart>
<namePart type="family">Mireshghallah</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Seattle, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Recent advances in NLP often stem from large transformer-based pre-trained models, which rapidly grow in size and use more and more training data. Such models are often released to the public so that end users can fine-tune them on a task dataset. While it is common to treat pre-training data as public, it may still contain personally identifiable information (PII), such as names, phone numbers, and copyrighted material. Recent findings show that the capacity of these models allows them to memorize parts of the training data, and suggest differentially private (DP) training as a potential mitigation. While there is recent work on DP fine-tuning of NLP models, the effects of DP pre-training are less well understood: it is not clear how downstream performance is affected by DP pre-training, and whether DP pre-training mitigates some of the memorization concerns. We focus on T5 and show that by using recent advances in JAX and XLA we can train models with DP that do not suffer a large drop in pre-training utility, nor in training speed, and can still be fine-tuned to high accuracy on downstream tasks (e.g. GLUE). Moreover, we show that T5’s span corruption is a good defense against data memorization.</abstract>
<identifier type="citekey">ponomareva-etal-2022-training-text</identifier>
<identifier type="doi">10.18653/v1/2022.privatenlp-1.4</identifier>
<location>
<url>https://aclanthology.org/2022.privatenlp-1.4/</url>
</location>
<part>
<date>2022-07</date>
<extent unit="page">
<start>21</start>
<end>21</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Training Text-to-Text Transformers with Privacy Guarantees
%A Ponomareva, Natalia
%A Bastings, Jasmijn
%A Vassilvitskii, Sergei
%Y Feyisetan, Oluwaseyi
%Y Ghanavati, Sepideh
%Y Thaine, Patricia
%Y Habernal, Ivan
%Y Mireshghallah, Fatemehsadat
%S Proceedings of the Fourth Workshop on Privacy in Natural Language Processing
%D 2022
%8 July
%I Association for Computational Linguistics
%C Seattle, United States
%F ponomareva-etal-2022-training-text
%X Recent advances in NLP often stem from large transformer-based pre-trained models, which rapidly grow in size and use more and more training data. Such models are often released to the public so that end users can fine-tune them on a task dataset. While it is common to treat pre-training data as public, it may still contain personally identifiable information (PII), such as names, phone numbers, and copyrighted material. Recent findings show that the capacity of these models allows them to memorize parts of the training data, and suggest differentially private (DP) training as a potential mitigation. While there is recent work on DP fine-tuning of NLP models, the effects of DP pre-training are less well understood: it is not clear how downstream performance is affected by DP pre-training, and whether DP pre-training mitigates some of the memorization concerns. We focus on T5 and show that by using recent advances in JAX and XLA we can train models with DP that do not suffer a large drop in pre-training utility, nor in training speed, and can still be fine-tuned to high accuracy on downstream tasks (e.g. GLUE). Moreover, we show that T5’s span corruption is a good defense against data memorization.
%R 10.18653/v1/2022.privatenlp-1.4
%U https://aclanthology.org/2022.privatenlp-1.4/
%U https://doi.org/10.18653/v1/2022.privatenlp-1.4
%P 21-21
Markdown (Informal)
[Training Text-to-Text Transformers with Privacy Guarantees](https://aclanthology.org/2022.privatenlp-1.4/) (Ponomareva et al., PrivateNLP 2022)
ACL
- Natalia Ponomareva, Jasmijn Bastings, and Sergei Vassilvitskii. 2022. Training Text-to-Text Transformers with Privacy Guarantees. In Proceedings of the Fourth Workshop on Privacy in Natural Language Processing, pages 21–21, Seattle, United States. Association for Computational Linguistics.