@inproceedings{xu-etal-2025-self-distillation,
title = "A Self-Distillation Recipe for Neural Machine Translation",
author = "Xu, Hongfei and
Liang, Zhuofei and
Liu, Qiuhui and
Mu, Lingling",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2025",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-acl.261/",
doi = "10.18653/v1/2025.findings-acl.261",
pages = "5050--5064",
ISBN = "979-8-89176-256-5",
abstract = "Self-distillation distills the deeper sub-networks to the shallower sub-networks without using an extra teacher model, and has been proven effective in improving the performance of a series of computer vision tasks. In this paper, we study the representation-based self-distillation methods for Neural Machine Translation (NMT) considering the efficiency issue with a large vocabulary. We present a rank-order augmented Pearson correlation loss and an iterative distillation method to prevent the discrepancy of predictions between the student and a stronger teacher from disturbing the training. To prevent the teacher from misleading the student{'}s learning, we utilize a warm-up strategy and present a gradient adaption method to scale down or zero the Knowledge Distillation (KD) gradients which are opposite to the translation. Experiments show that our method can lead to significant improvements over the strong Transformer baseline on low/middle/high-resource tasks, obtaining comparable performance to previous MT KD studies without pre-training a teacher. Deeper Transformer experiments show that our method can lead to comparable or better performance with fewer layers."
}
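Note: the sketch below is a minimal, illustrative reading of the abstract above, not the authors' released code. The Pearson-correlation objective over detached deeper-layer states and the PCGrad-style conflict check stand in for the paper's rank-order augmented loss and gradient adaptation method, and the names pearson_distill_loss and adapt_kd_gradients are hypothetical.

# Illustrative sketch only, assuming a PyTorch-style setup; not the paper's implementation.
import torch


def pearson_distill_loss(student_h, teacher_h, eps=1e-6):
    # student_h, teacher_h: (batch, seq_len, hidden) states from a shallower and a
    # deeper sub-network of the same model; the deeper states act as the teacher.
    t = teacher_h.detach()                      # teacher side provides targets only
    s = student_h - student_h.mean(dim=-1, keepdim=True)
    t = t - t.mean(dim=-1, keepdim=True)
    corr = (s * t).sum(dim=-1) / (s.norm(dim=-1) * t.norm(dim=-1) + eps)
    return (1.0 - corr).mean()                  # 0 when per-token states correlate perfectly


def adapt_kd_gradients(params, mt_loss, kd_loss):
    # PCGrad-like heuristic standing in for the paper's gradient adaptation: drop the
    # KD gradient for a parameter when it points against the translation gradient.
    g_mt = torch.autograd.grad(mt_loss, params, retain_graph=True, allow_unused=True)
    g_kd = torch.autograd.grad(kd_loss, params, allow_unused=True)
    for p, gm, gk in zip(params, g_mt, g_kd):
        if gm is None:
            continue
        if gk is not None and torch.sum(gm * gk) < 0:
            gk = None                           # zero the conflicting KD contribution
        p.grad = gm if gk is None else gm + gk


# Example wiring (hypothetical names), combining the losses during training:
# mt_loss = cross_entropy(logits, targets)
# kd_loss = pearson_distill_loss(hidden_layer3, hidden_layer6)
# adapt_kd_gradients(list(model.parameters()), mt_loss, kd_loss); optimizer.step()

Computing the distillation loss over hidden representations rather than vocabulary-sized output distributions is consistent with the abstract's stated efficiency concern for large vocabularies.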