@inproceedings{jin-etal-2024-align,
title = "Align-to-Distill: Trainable Attention Alignment for Knowledge Distillation in Neural Machine Translation",
author = "Jin, Heegon and
Son, Seonil and
Park, Jemin and
Kim, Youngseok and
Noh, Hyungjong and
Lee, Yeonsoo",
editor = "Calzolari, Nicoletta and
Kan, Min-Yen and
Hoste, Veronique and
Lenci, Alessandro and
Sakti, Sakriani and
Xue, Nianwen",
booktitle = "Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)",
month = may,
year = "2024",
address = "Torino, Italia",
publisher = "ELRA and ICCL",
url = "https://aclanthology.org/2024.lrec-main.64",
pages = "722--732",
abstract = "The advent of scalable deep models and large datasets has improved the performance of Neural Machine Translation (NMT). Knowledge Distillation (KD) enhances efficiency by transferring knowledge from a teacher model to a more compact student model. However, KD approaches to Transformer architecture often rely on heuristics, particularly when deciding which teacher layers to distill from. In this paper, we introduce the {``}Align-to-Distill{''} (A2D) strategy, designed to address the feature mapping problem by adaptively aligning student attention heads with their teacher counterparts during training. The Attention Alignment Module (AAM) in A2D performs a dense head-by-head comparison between student and teacher attention heads across layers, turning the combinatorial mapping heuristics into a learning problem. Our experiments show the efficacy of A2D, demonstrating gains of up to +3.61 and +0.63 BLEU points for WMT-2022 De→Dsb and WMT-2014 En→De, respectively, compared to Transformer baselines.The code and data are available at https://github.com/ncsoft/Align-to-Distill.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="jin-etal-2024-align">
<titleInfo>
<title>Align-to-Distill: Trainable Attention Alignment for Knowledge Distillation in Neural Machine Translation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Heegon</namePart>
<namePart type="family">Jin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Seonil</namePart>
<namePart type="family">Son</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jemin</namePart>
<namePart type="family">Park</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Youngseok</namePart>
<namePart type="family">Kim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hyungjong</namePart>
<namePart type="family">Noh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yeonsoo</namePart>
<namePart type="family">Lee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Min-Yen</namePart>
<namePart type="family">Kan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Veronique</namePart>
<namePart type="family">Hoste</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alessandro</namePart>
<namePart type="family">Lenci</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sakriani</namePart>
<namePart type="family">Sakti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nianwen</namePart>
<namePart type="family">Xue</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>ELRA and ICCL</publisher>
<place>
<placeTerm type="text">Torino, Italia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>The advent of scalable deep models and large datasets has improved the performance of Neural Machine Translation (NMT). Knowledge Distillation (KD) enhances efficiency by transferring knowledge from a teacher model to a more compact student model. However, KD approaches to Transformer architecture often rely on heuristics, particularly when deciding which teacher layers to distill from. In this paper, we introduce the “Align-to-Distill” (A2D) strategy, designed to address the feature mapping problem by adaptively aligning student attention heads with their teacher counterparts during training. The Attention Alignment Module (AAM) in A2D performs a dense head-by-head comparison between student and teacher attention heads across layers, turning the combinatorial mapping heuristics into a learning problem. Our experiments show the efficacy of A2D, demonstrating gains of up to +3.61 and +0.63 BLEU points for WMT-2022 De→Dsb and WMT-2014 En→De, respectively, compared to Transformer baselines.The code and data are available at https://github.com/ncsoft/Align-to-Distill.</abstract>
<identifier type="citekey">jin-etal-2024-align</identifier>
<location>
<url>https://aclanthology.org/2024.lrec-main.64</url>
</location>
<part>
<date>2024-05</date>
<extent unit="page">
<start>722</start>
<end>732</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Align-to-Distill: Trainable Attention Alignment for Knowledge Distillation in Neural Machine Translation
%A Jin, Heegon
%A Son, Seonil
%A Park, Jemin
%A Kim, Youngseok
%A Noh, Hyungjong
%A Lee, Yeonsoo
%Y Calzolari, Nicoletta
%Y Kan, Min-Yen
%Y Hoste, Veronique
%Y Lenci, Alessandro
%Y Sakti, Sakriani
%Y Xue, Nianwen
%S Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)
%D 2024
%8 May
%I ELRA and ICCL
%C Torino, Italia
%F jin-etal-2024-align
%X The advent of scalable deep models and large datasets has improved the performance of Neural Machine Translation (NMT). Knowledge Distillation (KD) enhances efficiency by transferring knowledge from a teacher model to a more compact student model. However, KD approaches to Transformer architecture often rely on heuristics, particularly when deciding which teacher layers to distill from. In this paper, we introduce the “Align-to-Distill” (A2D) strategy, designed to address the feature mapping problem by adaptively aligning student attention heads with their teacher counterparts during training. The Attention Alignment Module (AAM) in A2D performs a dense head-by-head comparison between student and teacher attention heads across layers, turning the combinatorial mapping heuristics into a learning problem. Our experiments show the efficacy of A2D, demonstrating gains of up to +3.61 and +0.63 BLEU points for WMT-2022 De→Dsb and WMT-2014 En→De, respectively, compared to Transformer baselines.The code and data are available at https://github.com/ncsoft/Align-to-Distill.
%U https://aclanthology.org/2024.lrec-main.64
%P 722-732
Markdown (Informal)
[Align-to-Distill: Trainable Attention Alignment for Knowledge Distillation in Neural Machine Translation](https://aclanthology.org/2024.lrec-main.64) (Jin et al., LREC-COLING 2024)
ACL