@inproceedings{suzuki-etal-2025-aligning,
title = "Aligning Sizes of Intermediate Layers by {L}o{RA} Adapter for Knowledge Distillation",
author = "Suzuki, Takeshi and
Yamada, Hiroaki and
Tokunaga, Takenobu",
editor = "Drozd, Aleksandr and
Sedoc, Jo{\~a}o and
Tafreshi, Shabnam and
Akula, Arjun and
Shu, Raphael",
booktitle = "The Sixth Workshop on Insights from Negative Results in NLP",
month = may,
year = "2025",
address = "Albuquerque, New Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.insights-1.10/",
doi = "10.18653/v1/2025.insights-1.10",
pages = "100--105",
ISBN = "979-8-89176-240-4",
abstract = "Intermediate Layer Distillation (ILD) is a variant of Knowledge Distillation (KD), a method for compressing neural networks.ILD requires mapping to align the intermediate layer sizes of the teacher and student models to compute the loss function in training, while this mapping is not used during inference.This inconsistency may reduce the effectiveness of learning in intermediate layers.In this study, we propose LoRAILD, which uses LoRA adapters to eliminate the inconsistency.However, our experimental results show that LoRAILD does not outperform existing methods.Furthermore, contrary to previous studies, we observe that conventional ILD does not outperform vanilla KD.Our analysis of the distilled models' intermediate layers suggests that ILD does not improve language models' performance."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="suzuki-etal-2025-aligning">
<titleInfo>
<title>Aligning Sizes of Intermediate Layers by LoRA Adapter for Knowledge Distillation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Takeshi</namePart>
<namePart type="family">Suzuki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hiroaki</namePart>
<namePart type="family">Yamada</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Takenobu</namePart>
<namePart type="family">Tokunaga</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>The Sixth Workshop on Insights from Negative Results in NLP</title>
</titleInfo>
<name type="personal">
<namePart type="given">Aleksandr</namePart>
<namePart type="family">Drozd</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">João</namePart>
<namePart type="family">Sedoc</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shabnam</namePart>
<namePart type="family">Tafreshi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Arjun</namePart>
<namePart type="family">Akula</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Raphael</namePart>
<namePart type="family">Shu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Albuquerque, New Mexico</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-240-4</identifier>
</relatedItem>
<abstract>Intermediate Layer Distillation (ILD) is a variant of Knowledge Distillation (KD), a method for compressing neural networks. ILD requires mapping to align the intermediate layer sizes of the teacher and student models to compute the loss function in training, while this mapping is not used during inference. This inconsistency may reduce the effectiveness of learning in intermediate layers. In this study, we propose LoRAILD, which uses LoRA adapters to eliminate the inconsistency. However, our experimental results show that LoRAILD does not outperform existing methods. Furthermore, contrary to previous studies, we observe that conventional ILD does not outperform vanilla KD. Our analysis of the distilled models’ intermediate layers suggests that ILD does not improve language models’ performance.</abstract>
<identifier type="citekey">suzuki-etal-2025-aligning</identifier>
<identifier type="doi">10.18653/v1/2025.insights-1.10</identifier>
<location>
<url>https://aclanthology.org/2025.insights-1.10/</url>
</location>
<part>
<date>2025-05</date>
<extent unit="page">
<start>100</start>
<end>105</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Aligning Sizes of Intermediate Layers by LoRA Adapter for Knowledge Distillation
%A Suzuki, Takeshi
%A Yamada, Hiroaki
%A Tokunaga, Takenobu
%Y Drozd, Aleksandr
%Y Sedoc, João
%Y Tafreshi, Shabnam
%Y Akula, Arjun
%Y Shu, Raphael
%S The Sixth Workshop on Insights from Negative Results in NLP
%D 2025
%8 May
%I Association for Computational Linguistics
%C Albuquerque, New Mexico
%@ 979-8-89176-240-4
%F suzuki-etal-2025-aligning
%X Intermediate Layer Distillation (ILD) is a variant of Knowledge Distillation (KD), a method for compressing neural networks. ILD requires mapping to align the intermediate layer sizes of the teacher and student models to compute the loss function in training, while this mapping is not used during inference. This inconsistency may reduce the effectiveness of learning in intermediate layers. In this study, we propose LoRAILD, which uses LoRA adapters to eliminate the inconsistency. However, our experimental results show that LoRAILD does not outperform existing methods. Furthermore, contrary to previous studies, we observe that conventional ILD does not outperform vanilla KD. Our analysis of the distilled models’ intermediate layers suggests that ILD does not improve language models’ performance.
%R 10.18653/v1/2025.insights-1.10
%U https://aclanthology.org/2025.insights-1.10/
%U https://doi.org/10.18653/v1/2025.insights-1.10
%P 100-105
Markdown (Informal)
[Aligning Sizes of Intermediate Layers by LoRA Adapter for Knowledge Distillation](https://aclanthology.org/2025.insights-1.10/) (Suzuki et al., insights 2025)
ACL
Takeshi Suzuki, Hiroaki Yamada, and Takenobu Tokunaga. 2025. [Aligning Sizes of Intermediate Layers by LoRA Adapter for Knowledge Distillation](https://aclanthology.org/2025.insights-1.10/). In *The Sixth Workshop on Insights from Negative Results in NLP*, pages 100–105, Albuquerque, New Mexico. Association for Computational Linguistics.
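
The abstract above describes the train/inference inconsistency in conventional Intermediate Layer Distillation: a mapping (for example, a linear projection) aligns the student's hidden size to the teacher's only while computing the training loss and is discarded at inference. The snippet below is a minimal, hypothetical sketch of that training-only mapping in PyTorch; the dimensions, module names, and the plain MSE hidden-state loss are illustrative assumptions, not the paper's actual LoRAILD implementation.

```python
import torch
import torch.nn as nn

# Hypothetical hidden sizes: a larger teacher and a smaller student.
TEACHER_DIM, STUDENT_DIM = 1024, 512

# Conventional ILD uses a mapping such as this projection to align layer
# sizes. It exists only to compute the training loss and is dropped at
# inference time -- the inconsistency the abstract refers to.
projection = nn.Linear(STUDENT_DIM, TEACHER_DIM)

def ild_loss(student_hidden: torch.Tensor, teacher_hidden: torch.Tensor) -> torch.Tensor:
    """MSE between the projected student hidden state and the teacher's (training only)."""
    return nn.functional.mse_loss(projection(student_hidden), teacher_hidden)

# Toy hidden states for one intermediate layer (batch=2, seq_len=4).
student_h = torch.randn(2, 4, STUDENT_DIM)
teacher_h = torch.randn(2, 4, TEACHER_DIM)
print(ild_loss(student_h, teacher_h))  # used in training; unused at inference
```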