@inproceedings{fan-etal-2025-maximizing,
title = "Maximizing the Effectiveness of Larger {BERT} Models for Compression",
author = "Fan, Wen-Shu and
Lu, Su and
Xing, Shangyu and
Li, Xin-Chun and
Zhan, De-Chuan",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.acl-long.1067/",
doi = "10.18653/v1/2025.acl-long.1067",
pages = "21975--21990",
ISBN = "979-8-89176-251-0",
abstract = "Knowledge distillation (KD) is a widely used approach for BERT compression, where a larger BERT model serves as a teacher to transfer knowledge to a smaller student model. Prior works have found that distilling a larger BERT with superior performance may degrade student{'}s performance than a smaller BERT. In this paper, we investigate the limitations of existing KD methods for larger BERT models. Through Canonical Correlation Analysis, we identify that these methods fail to fully exploit the potential advantages of larger teachers. To address this, we propose an improved distillation approach that effectively enhances knowledge transfer. Comprehensive experiments demonstrate the effectiveness of our method in enabling larger BERT models to distill knowledge more efficiently."
}
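For context on the setup the abstract describes: below is a minimal, illustrative sketch of the standard temperature-scaled distillation objective (Hinton et al., 2015) commonly used as the baseline for BERT compression. This is *not* the improved method the paper proposes; all function, variable, and hyperparameter names here are assumptions for illustration only.

```python
# Illustrative sketch of vanilla knowledge distillation (Hinton et al., 2015),
# the baseline setting the abstract refers to -- not this paper's method.
import torch
import torch.nn.functional as F

def kd_loss(student_logits: torch.Tensor,
            teacher_logits: torch.Tensor,
            labels: torch.Tensor,
            temperature: float = 2.0,
            alpha: float = 0.5) -> torch.Tensor:
    """Blend a soft distillation term with the usual hard-label loss."""
    # Soft targets: KL divergence between temperature-softened distributions.
    # The T^2 factor keeps gradient magnitudes comparable across temperatures.
    soft = F.kl_div(
        F.log_softmax(student_logits / temperature, dim=-1),
        F.softmax(teacher_logits / temperature, dim=-1),
        reduction="batchmean",
    ) * temperature ** 2
    # Hard targets: ordinary cross-entropy on the gold labels.
    hard = F.cross_entropy(student_logits, labels)
    return alpha * soft + (1.0 - alpha) * hard
```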
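The abstract's analysis tool, Canonical Correlation Analysis, measures how well two sets of representations align. A minimal sketch using scikit-learn's generic CCA follows; the paper's actual analysis protocol may differ (representation-analysis work often uses SVCCA/PWCCA variants), and the shapes, sample count, and component count below are assumptions chosen only to make the example run.

```python
# Illustrative sketch: comparing teacher and student hidden states with CCA.
# Hidden sizes (1024 for a large teacher, 312 for a small student) and all
# other numbers are hypothetical, not taken from the paper.
import numpy as np
from sklearn.cross_decomposition import CCA

rng = np.random.default_rng(0)
teacher_h = rng.standard_normal((512, 1024))  # (n_examples, teacher_dim)
student_h = rng.standard_normal((512, 312))   # (n_examples, student_dim)

n_components = 10
cca = CCA(n_components=n_components, max_iter=1000)
t_scores, s_scores = cca.fit_transform(teacher_h, student_h)

# Mean canonical correlation as a scalar similarity score: higher values
# mean the student's representation space tracks the teacher's more closely.
corrs = [np.corrcoef(t_scores[:, i], s_scores[:, i])[0, 1]
         for i in range(n_components)]
print(f"mean canonical correlation: {np.mean(corrs):.3f}")
```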