@inproceedings{rashid-etal-2021-towards,
    title = "Towards Zero-Shot Knowledge Distillation for Natural Language Processing",
    author = "Rashid, Ahmad and
      Lioutas, Vasileios and
      Ghaddar, Abbas and
      Rezagholizadeh, Mehdi",
    editor = "Moens, Marie-Francine and
      Huang, Xuanjing and
      Specia, Lucia and
      Yih, Scott Wen-tau",
    booktitle = "Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing",
    month = nov,
    year = "2021",
    address = "Online and Punta Cana, Dominican Republic",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2021.emnlp-main.526",
    doi = "10.18653/v1/2021.emnlp-main.526",
    pages = "6551--6561",
    abstract = "Knowledge distillation (KD) is a common knowledge transfer algorithm used for model compression across a variety of deep learning based natural language processing (NLP) solutions. In its regular manifestations, KD requires access to the teacher{'}s training data for knowledge transfer to the student network. However, privacy concerns, data regulations and proprietary reasons may prevent access to such data. We present, to the best of our knowledge, the first work on Zero-shot Knowledge Distillation for NLP, where the student learns from the much larger teacher without any task specific data. Our solution combines out-of-domain data and adversarial training to learn the teacher{'}s output distribution. We investigate six tasks from the GLUE benchmark and demonstrate that we can achieve between 75{\%} and 92{\%} of the teacher{'}s classification score (accuracy or F1) while compressing the model 30 times.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="rashid-etal-2021-towards">
    <titleInfo>
      <title>Towards Zero-Shot Knowledge Distillation for Natural Language Processing</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Ahmad</namePart>
      <namePart type="family">Rashid</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Vasileios</namePart>
      <namePart type="family">Lioutas</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Abbas</namePart>
      <namePart type="family">Ghaddar</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Mehdi</namePart>
      <namePart type="family">Rezagholizadeh</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2021-11</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Marie-Francine</namePart>
        <namePart type="family">Moens</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Xuanjing</namePart>
        <namePart type="family">Huang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Lucia</namePart>
        <namePart type="family">Specia</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Scott</namePart>
        <namePart type="given">Wen-tau</namePart>
        <namePart type="family">Yih</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Online and Punta Cana, Dominican Republic</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Knowledge distillation (KD) is a common knowledge transfer algorithm used for model compression across a variety of deep learning based natural language processing (NLP) solutions. In its regular manifestations, KD requires access to the teacher’s training data for knowledge transfer to the student network. However, privacy concerns, data regulations and proprietary reasons may prevent access to such data. We present, to the best of our knowledge, the first work on Zero-shot Knowledge Distillation for NLP, where the student learns from the much larger teacher without any task specific data. Our solution combines out-of-domain data and adversarial training to learn the teacher’s output distribution. We investigate six tasks from the GLUE benchmark and demonstrate that we can achieve between 75% and 92% of the teacher’s classification score (accuracy or F1) while compressing the model 30 times.</abstract>
    <identifier type="citekey">rashid-etal-2021-towards</identifier>
    <identifier type="doi">10.18653/v1/2021.emnlp-main.526</identifier>
    <location>
      <url>https://aclanthology.org/2021.emnlp-main.526</url>
    </location>
    <part>
      <date>2021-11</date>
      <extent unit="page">
        <start>6551</start>
        <end>6561</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Towards Zero-Shot Knowledge Distillation for Natural Language Processing
%A Rashid, Ahmad
%A Lioutas, Vasileios
%A Ghaddar, Abbas
%A Rezagholizadeh, Mehdi
%Y Moens, Marie-Francine
%Y Huang, Xuanjing
%Y Specia, Lucia
%Y Yih, Scott Wen-tau
%S Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing
%D 2021
%8 November
%I Association for Computational Linguistics
%C Online and Punta Cana, Dominican Republic
%F rashid-etal-2021-towards
%X Knowledge distillation (KD) is a common knowledge transfer algorithm used for model compression across a variety of deep learning based natural language processing (NLP) solutions. In its regular manifestations, KD requires access to the teacher’s training data for knowledge transfer to the student network. However, privacy concerns, data regulations and proprietary reasons may prevent access to such data. We present, to the best of our knowledge, the first work on Zero-shot Knowledge Distillation for NLP, where the student learns from the much larger teacher without any task specific data. Our solution combines out-of-domain data and adversarial training to learn the teacher’s output distribution. We investigate six tasks from the GLUE benchmark and demonstrate that we can achieve between 75% and 92% of the teacher’s classification score (accuracy or F1) while compressing the model 30 times.
%R 10.18653/v1/2021.emnlp-main.526
%U https://aclanthology.org/2021.emnlp-main.526
%U https://doi.org/10.18653/v1/2021.emnlp-main.526
%P 6551-6561
Markdown (Informal)
[Towards Zero-Shot Knowledge Distillation for Natural Language Processing](https://aclanthology.org/2021.emnlp-main.526) (Rashid et al., EMNLP 2021)
ACL
Ahmad Rashid, Vasileios Lioutas, Abbas Ghaddar, and Mehdi Rezagholizadeh. 2021. Towards Zero-Shot Knowledge Distillation for Natural Language Processing. In Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing, pages 6551–6561, Online and Punta Cana, Dominican Republic. Association for Computational Linguistics.