@inproceedings{ren-etal-2023-tailoring,
title = "Tailoring Instructions to Student{'}s Learning Levels Boosts Knowledge Distillation",
author = "Ren, Yuxin and
Zhong, Zihan and
Shi, Xingjian and
Zhu, Yi and
Yuan, Chun and
Li, Mu",
editor = "Rogers, Anna and
Boyd-Graber, Jordan and
Okazaki, Naoaki",
booktitle = "Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = "2023",
address = "Toronto, Canada",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.acl-long.111",
doi = "10.18653/v1/2023.acl-long.111",
pages = "1990--2006",
abstract = "It has been commonly observed that a teacher model with superior performance does not necessarily result in a stronger student, highlighting a discrepancy between current teacher training practices and effective knowledge transfer. In order to enhance the guidance of the teacher training process, we introduce the concept of distillation influence to determine the impact of distillation from each training sample on the student{'}s generalization ability. In this paper, we propose Learning Good Teacher Matters (LGTM), an efficient training technique for incorporating distillation influence into the teacher{'}s learning process. By prioritizing samples that are likely to enhance the student{'}s generalization ability, our LGTM outperforms 10 common knowledge distillation baselines on 6 text classification tasks in the GLUE benchmark.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="ren-etal-2023-tailoring">
    <titleInfo>
      <title>Tailoring Instructions to Student’s Learning Levels Boosts Knowledge Distillation</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Yuxin</namePart>
      <namePart type="family">Ren</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Zihan</namePart>
      <namePart type="family">Zhong</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Xingjian</namePart>
      <namePart type="family">Shi</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Yi</namePart>
      <namePart type="family">Zhu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Chun</namePart>
      <namePart type="family">Yuan</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Mu</namePart>
      <namePart type="family">Li</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2023-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Anna</namePart>
        <namePart type="family">Rogers</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jordan</namePart>
        <namePart type="family">Boyd-Graber</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Naoaki</namePart>
        <namePart type="family">Okazaki</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Toronto, Canada</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>It has been commonly observed that a teacher model with superior performance does not necessarily result in a stronger student, highlighting a discrepancy between current teacher training practices and effective knowledge transfer. In order to enhance the guidance of the teacher training process, we introduce the concept of distillation influence to determine the impact of distillation from each training sample on the student’s generalization ability. In this paper, we propose Learning Good Teacher Matters (LGTM), an efficient training technique for incorporating distillation influence into the teacher’s learning process. By prioritizing samples that are likely to enhance the student’s generalization ability, our LGTM outperforms 10 common knowledge distillation baselines on 6 text classification tasks in the GLUE benchmark.</abstract>
    <identifier type="citekey">ren-etal-2023-tailoring</identifier>
    <identifier type="doi">10.18653/v1/2023.acl-long.111</identifier>
    <location>
      <url>https://aclanthology.org/2023.acl-long.111</url>
    </location>
    <part>
      <date>2023-07</date>
      <extent unit="page">
        <start>1990</start>
        <end>2006</end>
      </extent>
    </part>
  </mods>
</modsCollection>

%0 Conference Proceedings
%T Tailoring Instructions to Student’s Learning Levels Boosts Knowledge Distillation
%A Ren, Yuxin
%A Zhong, Zihan
%A Shi, Xingjian
%A Zhu, Yi
%A Yuan, Chun
%A Li, Mu
%Y Rogers, Anna
%Y Boyd-Graber, Jordan
%Y Okazaki, Naoaki
%S Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2023
%8 July
%I Association for Computational Linguistics
%C Toronto, Canada
%F ren-etal-2023-tailoring
%X It has been commonly observed that a teacher model with superior performance does not necessarily result in a stronger student, highlighting a discrepancy between current teacher training practices and effective knowledge transfer. In order to enhance the guidance of the teacher training process, we introduce the concept of distillation influence to determine the impact of distillation from each training sample on the student’s generalization ability. In this paper, we propose Learning Good Teacher Matters (LGTM), an efficient training technique for incorporating distillation influence into the teacher’s learning process. By prioritizing samples that are likely to enhance the student’s generalization ability, our LGTM outperforms 10 common knowledge distillation baselines on 6 text classification tasks in the GLUE benchmark.
%R 10.18653/v1/2023.acl-long.111
%U https://aclanthology.org/2023.acl-long.111
%U https://doi.org/10.18653/v1/2023.acl-long.111
%P 1990-2006

Markdown (Informal)
[Tailoring Instructions to Student’s Learning Levels Boosts Knowledge Distillation](https://aclanthology.org/2023.acl-long.111) (Ren et al., ACL 2023)

ACL
Yuxin Ren, Zihan Zhong, Xingjian Shi, Yi Zhu, Chun Yuan, and Mu Li. 2023. Tailoring Instructions to Student’s Learning Levels Boosts Knowledge Distillation. In Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 1990–2006, Toronto, Canada. Association for Computational Linguistics.
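
As a concrete reference point for the abstract above, the sketch below shows the standard soft-label knowledge distillation objective that LGTM and the baselines it is compared against build on. It is not the paper's LGTM method: the temperature, the mixing weight `alpha`, the toy logits, and the optional `sample_weights` hook (standing in for per-sample prioritization in the spirit of distillation influence) are illustrative assumptions.

```python
# Minimal sketch of vanilla teacher->student distillation (Hinton-style soft labels).
# NOT the LGTM objective from the paper; hyperparameters and toy data are assumptions.
import torch
import torch.nn.functional as F


def distillation_loss(student_logits, teacher_logits, labels,
                      temperature=2.0, alpha=0.5, sample_weights=None):
    """alpha * KL(teacher || student) + (1 - alpha) * CE, computed per sample."""
    # Softened distributions; keep the per-sample KL so it can be reweighted.
    log_p_student = F.log_softmax(student_logits / temperature, dim=-1)
    p_teacher = F.softmax(teacher_logits / temperature, dim=-1)
    kd = F.kl_div(log_p_student, p_teacher, reduction="none").sum(-1) * temperature ** 2
    ce = F.cross_entropy(student_logits, labels, reduction="none")
    per_sample = alpha * kd + (1.0 - alpha) * ce
    if sample_weights is not None:
        # Hypothetical hook: upweight samples expected to help the student's
        # generalization (the idea the paper's distillation influence captures).
        per_sample = per_sample * sample_weights
    return per_sample.mean()


# Toy usage with random logits standing in for a GLUE-style binary classifier.
if __name__ == "__main__":
    torch.manual_seed(0)
    student_logits = torch.randn(8, 2, requires_grad=True)
    teacher_logits = torch.randn(8, 2)
    labels = torch.randint(0, 2, (8,))
    loss = distillation_loss(student_logits, teacher_logits, labels)
    loss.backward()
    print(f"distillation loss: {loss.item():.4f}")
```

In the paper's framing, the per-sample weighting would instead be derived from how much distilling each training sample is expected to improve the student's generalization, which is what feeds back into the teacher's own training.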