@inproceedings{saadi-etal-2023-learn,
title = "Learn From One Specialized Sub-Teacher: One-to-One Mapping for Feature-Based Knowledge Distillation",
author = "Saadi, Khouloud and
Mitrovi{\'c}, Jelena and
Granitzer, Michael",
editor = "Bouamor, Houda and
Pino, Juan and
Bali, Kalika",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2023",
month = dec,
year = "2023",
address = "Singapore",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.findings-emnlp.882",
doi = "10.18653/v1/2023.findings-emnlp.882",
pages = "13235--13245",
abstract = "Knowledge distillation is known as an effective technique for compressing over-parameterized language models. In this work, we propose to break down the global feature distillation task into N local sub-tasks. In this new framework, we consider each neuron in the last hidden layer of the teacher network as a specialized sub-teacher. We also consider each neuron in the last hidden layer of the student network as a focused sub-student. We make each focused sub-student learn from one corresponding specialized sub-teacher and ignore the others. This will facilitate the task for the sub-student and keep it focused. Our proposed method is novel and can be combined with other distillation techniques. Empirical results show that our proposed approach outperforms the state-of-the-art methods by maintaining higher performance on most benchmark datasets. Furthermore, we propose a randomized variant of our approach, called Masked One-to-One Mapping. Rather than learning all the N sub-tasks simultaneously, we focus on learning a subset of these sub-tasks at each optimization step. This variant enables the student to digest the received flow of knowledge more effectively and yields superior results.",
}
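The abstract above describes the one-to-one mapping and its masked variant only at a conceptual level. The PyTorch-style sketch below is a minimal, hypothetical reading of that idea, not the authors' released implementation: the function name `one_to_one_feature_loss`, the per-dimension standardization, the squared-error form of each local sub-task, and the `mask_ratio` parameter for the masked variant are all illustrative assumptions; the paper linked in the records below defines the exact loss.

```python
import torch

def one_to_one_feature_loss(student_h, teacher_h, mask_ratio=0.0):
    """Illustrative one-to-one feature distillation loss (not the authors' code).

    student_h, teacher_h: (batch, hidden) last-hidden-layer features with the
    same hidden size (assume the student is projected to the teacher's width
    beforehand if the sizes differ). Student dimension i is aligned only with
    teacher dimension i; with mask_ratio > 0, a random subset of dimensions is
    skipped at each step, loosely mirroring "Masked One-to-One Mapping".
    """
    teacher_h = teacher_h.detach()  # no gradients flow into the teacher

    # Standardize each dimension (neuron) across the batch so the loss compares
    # activation patterns rather than raw scales.
    s = (student_h - student_h.mean(dim=0)) / (student_h.std(dim=0) + 1e-6)
    t = (teacher_h - teacher_h.mean(dim=0)) / (teacher_h.std(dim=0) + 1e-6)

    # One local sub-task per neuron: error between matched dimensions only.
    per_dim_loss = ((s - t) ** 2).mean(dim=0)  # shape: (hidden,)

    if mask_ratio > 0.0:
        keep = torch.rand_like(per_dim_loss) >= mask_ratio
        if keep.any():  # guard against masking out every dimension
            per_dim_loss = per_dim_loss[keep]

    return per_dim_loss.mean()


# Hypothetical usage on one batch, e.g. aligning [CLS] features:
# loss = one_to_one_feature_loss(student_out.last_hidden_state[:, 0],
#                                teacher_out.last_hidden_state[:, 0],
#                                mask_ratio=0.5)
```

The design point the sketch tries to capture is that each student neuron receives a training signal only from its matched teacher neuron, which is what keeps every sub-task local.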
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="saadi-etal-2023-learn">
    <titleInfo>
      <title>Learn From One Specialized Sub-Teacher: One-to-One Mapping for Feature-Based Knowledge Distillation</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Khouloud</namePart>
      <namePart type="family">Saadi</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Jelena</namePart>
      <namePart type="family">Mitrović</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Michael</namePart>
      <namePart type="family">Granitzer</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2023-12</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: EMNLP 2023</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Houda</namePart>
        <namePart type="family">Bouamor</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Juan</namePart>
        <namePart type="family">Pino</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Kalika</namePart>
        <namePart type="family">Bali</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Singapore</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Knowledge distillation is known as an effective technique for compressing over-parameterized language models. In this work, we propose to break down the global feature distillation task into N local sub-tasks. In this new framework, we consider each neuron in the last hidden layer of the teacher network as a specialized sub-teacher. We also consider each neuron in the last hidden layer of the student network as a focused sub-student. We make each focused sub-student learn from one corresponding specialized sub-teacher and ignore the others. This will facilitate the task for the sub-student and keep it focused. Our proposed method is novel and can be combined with other distillation techniques. Empirical results show that our proposed approach outperforms the state-of-the-art methods by maintaining higher performance on most benchmark datasets. Furthermore, we propose a randomized variant of our approach, called Masked One-to-One Mapping. Rather than learning all the N sub-tasks simultaneously, we focus on learning a subset of these sub-tasks at each optimization step. This variant enables the student to digest the received flow of knowledge more effectively and yields superior results.</abstract>
    <identifier type="citekey">saadi-etal-2023-learn</identifier>
    <identifier type="doi">10.18653/v1/2023.findings-emnlp.882</identifier>
    <location>
      <url>https://aclanthology.org/2023.findings-emnlp.882</url>
    </location>
    <part>
      <date>2023-12</date>
      <extent unit="page">
        <start>13235</start>
        <end>13245</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Learn From One Specialized Sub-Teacher: One-to-One Mapping for Feature-Based Knowledge Distillation
%A Saadi, Khouloud
%A Mitrović, Jelena
%A Granitzer, Michael
%Y Bouamor, Houda
%Y Pino, Juan
%Y Bali, Kalika
%S Findings of the Association for Computational Linguistics: EMNLP 2023
%D 2023
%8 December
%I Association for Computational Linguistics
%C Singapore
%F saadi-etal-2023-learn
%X Knowledge distillation is known as an effective technique for compressing over-parameterized language models. In this work, we propose to break down the global feature distillation task into N local sub-tasks. In this new framework, we consider each neuron in the last hidden layer of the teacher network as a specialized sub-teacher. We also consider each neuron in the last hidden layer of the student network as a focused sub-student. We make each focused sub-student learn from one corresponding specialized sub-teacher and ignore the others. This will facilitate the task for the sub-student and keep it focused. Our proposed method is novel and can be combined with other distillation techniques. Empirical results show that our proposed approach outperforms the state-of-the-art methods by maintaining higher performance on most benchmark datasets. Furthermore, we propose a randomized variant of our approach, called Masked One-to-One Mapping. Rather than learning all the N sub-tasks simultaneously, we focus on learning a subset of these sub-tasks at each optimization step. This variant enables the student to digest the received flow of knowledge more effectively and yields superior results.
%R 10.18653/v1/2023.findings-emnlp.882
%U https://aclanthology.org/2023.findings-emnlp.882
%U https://doi.org/10.18653/v1/2023.findings-emnlp.882
%P 13235-13245
Markdown (Informal)
[Learn From One Specialized Sub-Teacher: One-to-One Mapping for Feature-Based Knowledge Distillation](https://aclanthology.org/2023.findings-emnlp.882) (Saadi et al., Findings 2023)
ACL
Khouloud Saadi, Jelena Mitrović, and Michael Granitzer. 2023. Learn From One Specialized Sub-Teacher: One-to-One Mapping for Feature-Based Knowledge Distillation. In Findings of the Association for Computational Linguistics: EMNLP 2023, pages 13235–13245, Singapore. Association for Computational Linguistics.