@inproceedings{anil-etal-2022-large,
title = "Large-Scale Differentially Private {BERT}",
author = "Anil, Rohan and
Ghazi, Badih and
Gupta, Vineet and
Kumar, Ravi and
Manurangsi, Pasin",
editor = "Goldberg, Yoav and
Kozareva, Zornitsa and
Zhang, Yue",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2022",
month = dec,
year = "2022",
address = "Abu Dhabi, United Arab Emirates",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.findings-emnlp.484",
doi = "10.18653/v1/2022.findings-emnlp.484",
pages = "6481--6491",
abstract = "In this work, we study the large-scale pretraining of BERT-Large (Devlin et al., 2019) with differentially private SGD (DP-SGD). We show that combined with a careful implementation, scaling up the batch size to millions (i.e., mega-batches) improves the utility of the DP-SGD step for BERT; we also enhance the training efficiency by using an increasing batch size schedule. Our implementation builds on the recent work of Subramani et al (2020), who demonstrated that the overhead of a DP-SGD step is minimized with effective use of JAX (Bradbury et al., 2018; Frostig et al., 2018) primitives in conjunction with the XLA compiler (XLA team and collaborators, 2017). Our implementation achieves a masked language model accuracy of 60.5{\%} at a batch size of 2M, for epsilon=5, which is a reasonable privacy setting. To put this number in perspective, non-private BERT models achieve an accuracy of ∼70{\%}.",
}
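
The abstract above lays out the core recipe: DP-SGD (per-example gradient clipping plus Gaussian noise) run at very large batch sizes, with the per-example work made cheap by JAX primitives compiled through XLA. Below is a minimal, illustrative sketch of a single DP-SGD step in that style; the toy linear model, `loss_fn`, and every hyperparameter (`lr`, `clip_norm`, `noise_multiplier`) are placeholder assumptions for illustration, not the paper's BERT implementation.

```python
# Illustrative DP-SGD step in JAX: per-example gradients via vmap + grad,
# per-example L2 clipping, Gaussian noise on the averaged gradient, all
# compiled by XLA through jit. A sketch under assumed names, not the
# paper's actual code.
import jax
import jax.numpy as jnp


def loss_fn(params, x, y):
    # Toy per-example squared error for a linear model; stands in for the
    # masked-language-model loss used in the paper.
    pred = jnp.dot(x, params["w"]) + params["b"]
    return (pred - y) ** 2


def clip_grad(grad, clip_norm):
    # Rescale one example's gradient so its global L2 norm is <= clip_norm.
    leaves = jax.tree_util.tree_leaves(grad)
    total_norm = jnp.sqrt(sum(jnp.sum(g ** 2) for g in leaves))
    scale = jnp.minimum(1.0, clip_norm / (total_norm + 1e-12))
    return jax.tree_util.tree_map(lambda g: g * scale, grad)


@jax.jit
def dp_sgd_step(params, x, y, key, lr=0.1, clip_norm=1.0, noise_multiplier=1.0):
    batch_size = x.shape[0]
    # Per-example gradients: map grad over the batch axis of x and y.
    per_example_grads = jax.vmap(jax.grad(loss_fn), in_axes=(None, 0, 0))(params, x, y)
    clipped = jax.vmap(clip_grad, in_axes=(0, None))(per_example_grads, clip_norm)
    mean_grad = jax.tree_util.tree_map(lambda g: jnp.mean(g, axis=0), clipped)
    # Gaussian noise calibrated to the clipping norm: the noise stddev on the
    # *averaged* gradient is noise_multiplier * clip_norm / batch_size.
    leaves, treedef = jax.tree_util.tree_flatten(mean_grad)
    keys = jax.random.split(key, len(leaves))
    sigma = noise_multiplier * clip_norm / batch_size
    noisy = [g + sigma * jax.random.normal(k, g.shape) for g, k in zip(leaves, keys)]
    noisy_grad = jax.tree_util.tree_unflatten(treedef, noisy)
    # Plain SGD update on the noised gradient.
    return jax.tree_util.tree_map(lambda p, g: p - lr * g, params, noisy_grad)


# Example usage with random data (shapes only; not the paper's setup).
params = {"w": jnp.zeros(4), "b": jnp.zeros(())}
x = jax.random.normal(jax.random.PRNGKey(0), (32, 4))
y = jnp.ones(32)
params = dp_sgd_step(params, x, y, jax.random.PRNGKey(1))
```

Because the noise added to the averaged gradient has standard deviation noise_multiplier * clip_norm / batch_size, growing the batch size toward the millions directly improves each step's signal-to-noise ratio, which is the utility gain the abstract attributes to mega-batches.
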
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="anil-etal-2022-large">
<titleInfo>
<title>Large-Scale Differentially Private BERT</title>
</titleInfo>
<name type="personal">
<namePart type="given">Rohan</namePart>
<namePart type="family">Anil</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Badih</namePart>
<namePart type="family">Ghazi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vineet</namePart>
<namePart type="family">Gupta</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ravi</namePart>
<namePart type="family">Kumar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pasin</namePart>
<namePart type="family">Manurangsi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2022</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yoav</namePart>
<namePart type="family">Goldberg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zornitsa</namePart>
<namePart type="family">Kozareva</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yue</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, United Arab Emirates</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>In this work, we study the large-scale pretraining of BERT-Large (Devlin et al., 2019) with differentially private SGD (DP-SGD). We show that combined with a careful implementation, scaling up the batch size to millions (i.e., mega-batches) improves the utility of the DP-SGD step for BERT; we also enhance the training efficiency by using an increasing batch size schedule. Our implementation builds on the recent work of Subramani et al (2020), who demonstrated that the overhead of a DP-SGD step is minimized with effective use of JAX (Bradbury et al., 2018; Frostig et al., 2018) primitives in conjunction with the XLA compiler (XLA team and collaborators, 2017). Our implementation achieves a masked language model accuracy of 60.5% at a batch size of 2M, for epsilon=5, which is a reasonable privacy setting. To put this number in perspective, non-private BERT models achieve an accuracy of ∼70%.</abstract>
<identifier type="citekey">anil-etal-2022-large</identifier>
<identifier type="doi">10.18653/v1/2022.findings-emnlp.484</identifier>
<location>
<url>https://aclanthology.org/2022.findings-emnlp.484</url>
</location>
<part>
<date>2022-12</date>
<extent unit="page">
<start>6481</start>
<end>6491</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Large-Scale Differentially Private BERT
%A Anil, Rohan
%A Ghazi, Badih
%A Gupta, Vineet
%A Kumar, Ravi
%A Manurangsi, Pasin
%Y Goldberg, Yoav
%Y Kozareva, Zornitsa
%Y Zhang, Yue
%S Findings of the Association for Computational Linguistics: EMNLP 2022
%D 2022
%8 December
%I Association for Computational Linguistics
%C Abu Dhabi, United Arab Emirates
%F anil-etal-2022-large
%X In this work, we study the large-scale pretraining of BERT-Large (Devlin et al., 2019) with differentially private SGD (DP-SGD). We show that, combined with a careful implementation, scaling up the batch size to millions (i.e., mega-batches) improves the utility of the DP-SGD step for BERT; we also enhance training efficiency by using an increasing batch size schedule. Our implementation builds on the recent work of Subramani et al. (2020), who demonstrated that the overhead of a DP-SGD step is minimized with effective use of JAX (Bradbury et al., 2018; Frostig et al., 2018) primitives in conjunction with the XLA compiler (XLA team and collaborators, 2017). Our implementation achieves a masked language model accuracy of 60.5% at a batch size of 2M, for epsilon=5, which is a reasonable privacy setting. To put this number in perspective, non-private BERT models achieve an accuracy of ∼70%.
%R 10.18653/v1/2022.findings-emnlp.484
%U https://aclanthology.org/2022.findings-emnlp.484
%U https://doi.org/10.18653/v1/2022.findings-emnlp.484
%P 6481-6491
Markdown (Informal)
[Large-Scale Differentially Private BERT](https://aclanthology.org/2022.findings-emnlp.484) (Anil et al., Findings 2022)
ACL
- Rohan Anil, Badih Ghazi, Vineet Gupta, Ravi Kumar, and Pasin Manurangsi. 2022. Large-Scale Differentially Private BERT. In Findings of the Association for Computational Linguistics: EMNLP 2022, pages 6481–6491, Abu Dhabi, United Arab Emirates. Association for Computational Linguistics.