% Citation record (BibTeX form) for the paper at aclanthology.org/2022.emnlp-industry.43
@inproceedings{fetahu-etal-2022-distilling,
    title     = {Distilling Multilingual Transformers into {CNN}s for Scalable Intent Classification},
    author    = {Fetahu, Besnik and
                 Veeragouni, Akash and
                 Rokhlenko, Oleg and
                 Malmasi, Shervin},
    editor    = {Li, Yunyao and
                 Lazaridou, Angeliki},
    booktitle = {Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing: Industry Track},
    month     = dec,
    year      = {2022},
    address   = {Abu Dhabi, UAE},
    publisher = {Association for Computational Linguistics},
    url       = {https://aclanthology.org/2022.emnlp-industry.43},
    doi       = {10.18653/v1/2022.emnlp-industry.43},
    pages     = {429--439},
    abstract  = {We describe an application of Knowledge Distillation used to distill and deploy multilingual Transformer models for voice assistants, enabling text classification for customers globally. Transformers have set new state-of-the-art results for tasks like intent classification, and multilingual models exploit cross-lingual transfer to allow serving requests across 100+ languages. However, their prohibitive inference time makes them impractical to deploy in real-world scenarios with low latency requirements, such as is the case of voice assistants. We address the problem of cross-architecture distillation of multilingual Transformers to simpler models, while maintaining multilinguality without performance degradation. Training multilingual student models has received little attention, and is our main focus. We show that a teacher-student framework, where the teacher{'}s unscaled activations (logits) on unlabelled data are used to supervise student model training, enables distillation of Transformers into efficient multilingual CNN models. Our student model achieves equivalent performance as the teacher, and outperforms a similar model trained on the labelled data used to train the teacher model. This approach has enabled us to accurately serve global customer requests at speed (18x improvement), scale, and low cost.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="fetahu-etal-2022-distilling">
    <!-- Title and authors of the paper itself. -->
    <titleInfo>
      <title>Distilling Multilingual Transformers into CNNs for Scalable Intent Classification</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Besnik</namePart>
      <namePart type="family">Fetahu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Akash</namePart>
      <namePart type="family">Veeragouni</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Oleg</namePart>
      <namePart type="family">Rokhlenko</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Shervin</namePart>
      <namePart type="family">Malmasi</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2022-12</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host publication: the proceedings volume, its editors, publisher and place. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing: Industry Track</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Yunyao</namePart>
        <namePart type="family">Li</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Angeliki</namePart>
        <namePart type="family">Lazaridou</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Abu Dhabi, UAE</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>We describe an application of Knowledge Distillation used to distill and deploy multilingual Transformer models for voice assistants, enabling text classification for customers globally. Transformers have set new state-of-the-art results for tasks like intent classification, and multilingual models exploit cross-lingual transfer to allow serving requests across 100+ languages. However, their prohibitive inference time makes them impractical to deploy in real-world scenarios with low latency requirements, such as is the case of voice assistants. We address the problem of cross-architecture distillation of multilingual Transformers to simpler models, while maintaining multilinguality without performance degradation. Training multilingual student models has received little attention, and is our main focus. We show that a teacher-student framework, where the teacher’s unscaled activations (logits) on unlabelled data are used to supervise student model training, enables distillation of Transformers into efficient multilingual CNN models. Our student model achieves equivalent performance as the teacher, and outperforms a similar model trained on the labelled data used to train the teacher model. This approach has enabled us to accurately serve global customer requests at speed (18x improvement), scale, and low cost.</abstract>
    <!-- Identifiers and online location of the record. -->
    <identifier type="citekey">fetahu-etal-2022-distilling</identifier>
    <identifier type="doi">10.18653/v1/2022.emnlp-industry.43</identifier>
    <location>
      <url>https://aclanthology.org/2022.emnlp-industry.43</url>
    </location>
    <!-- Date and page extent of this part within the host volume. -->
    <part>
      <date>2022-12</date>
      <extent unit="page">
        <start>429</start>
        <end>439</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Distilling Multilingual Transformers into CNNs for Scalable Intent Classification
%A Fetahu, Besnik
%A Veeragouni, Akash
%A Rokhlenko, Oleg
%A Malmasi, Shervin
%Y Li, Yunyao
%Y Lazaridou, Angeliki
%S Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing: Industry Track
%D 2022
%8 December
%I Association for Computational Linguistics
%C Abu Dhabi, UAE
%F fetahu-etal-2022-distilling
%X We describe an application of Knowledge Distillation used to distill and deploy multilingual Transformer models for voice assistants, enabling text classification for customers globally. Transformers have set new state-of-the-art results for tasks like intent classification, and multilingual models exploit cross-lingual transfer to allow serving requests across 100+ languages. However, their prohibitive inference time makes them impractical to deploy in real-world scenarios with low latency requirements, such as is the case of voice assistants. We address the problem of cross-architecture distillation of multilingual Transformers to simpler models, while maintaining multilinguality without performance degradation. Training multilingual student models has received little attention, and is our main focus. We show that a teacher-student framework, where the teacher’s unscaled activations (logits) on unlabelled data are used to supervise student model training, enables distillation of Transformers into efficient multilingual CNN models. Our student model achieves equivalent performance as the teacher, and outperforms a similar model trained on the labelled data used to train the teacher model. This approach has enabled us to accurately serve global customer requests at speed (18x improvement), scale, and low cost.
%R 10.18653/v1/2022.emnlp-industry.43
%U https://aclanthology.org/2022.emnlp-industry.43
%U https://doi.org/10.18653/v1/2022.emnlp-industry.43
%P 429-439
Markdown (Informal)
[Distilling Multilingual Transformers into CNNs for Scalable Intent Classification](https://aclanthology.org/2022.emnlp-industry.43) (Fetahu et al., EMNLP 2022)
ACL