@inproceedings{norouzi-etal-2023-dims,
title = "{D}i{MS}: Distilling Multiple Steps of Iterative Non-Autoregressive Transformers for Machine Translation",
author = "Norouzi, Sajad and
Hosseinzadeh, Rasa and
Perez, Felipe and
Volkovs, Maksims",
editor = "Rogers, Anna and
Boyd-Graber, Jordan and
Okazaki, Naoaki",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2023",
month = jul,
year = "2023",
address = "Toronto, Canada",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.findings-acl.542",
doi = "10.18653/v1/2023.findings-acl.542",
pages = "8538--8553",
abstract = "The computational benefits of iterative non-autoregressive transformers decrease as the number of decoding steps increases. As a remedy, we introduce Distill Multiple Steps (DiMS), a simple yet effective distillation technique to decrease the number of required steps to reach a certain translation quality. The distilled model enjoys the computational benefits of early iterations while preserving the enhancements from several iterative steps. DiMS relies on two models namely student and teacher. The student is optimized to predict the output of the teacher after multiple decoding steps while the teacher follows the student via a slow-moving average. The moving average keeps the teacher{'}s knowledge updated and enhances the quality of the labels provided by the teacher. During inference, the student is used for translation and no additional computation is added. We verify the effectiveness of DiMS on various models obtaining 7.8 and 12.9 BLEU points improvements in single-step translation accuracy on distilled and raw versions of WMT{'}14 De-En.Full code for this work is available here: \url{https://github.com/layer6ai-labs/DiMS}",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="norouzi-etal-2023-dims">
<titleInfo>
<title>DiMS: Distilling Multiple Steps of Iterative Non-Autoregressive Transformers for Machine Translation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Sajad</namePart>
<namePart type="family">Norouzi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rasa</namePart>
<namePart type="family">Hosseinzadeh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Felipe</namePart>
<namePart type="family">Perez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Maksims</namePart>
<namePart type="family">Volkovs</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2023</title>
</titleInfo>
<name type="personal">
<namePart type="given">Anna</namePart>
<namePart type="family">Rogers</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jordan</namePart>
<namePart type="family">Boyd-Graber</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Naoaki</namePart>
<namePart type="family">Okazaki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Toronto, Canada</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
  <abstract>The computational benefits of iterative non-autoregressive transformers decrease as the number of decoding steps increases. As a remedy, we introduce Distill Multiple Steps (DiMS), a simple yet effective distillation technique to decrease the number of required steps to reach a certain translation quality. The distilled model enjoys the computational benefits of early iterations while preserving the enhancements from several iterative steps. DiMS relies on two models, namely a student and a teacher. The student is optimized to predict the output of the teacher after multiple decoding steps, while the teacher follows the student via a slow-moving average. The moving average keeps the teacher’s knowledge updated and enhances the quality of the labels provided by the teacher. During inference, the student is used for translation and no additional computation is added. We verify the effectiveness of DiMS on various models, obtaining 7.8 and 12.9 BLEU point improvements in single-step translation accuracy on distilled and raw versions of WMT’14 De-En. Full code for this work is available here: https://github.com/layer6ai-labs/DiMS</abstract>
<identifier type="citekey">norouzi-etal-2023-dims</identifier>
<identifier type="doi">10.18653/v1/2023.findings-acl.542</identifier>
<location>
<url>https://aclanthology.org/2023.findings-acl.542</url>
</location>
<part>
<date>2023-07</date>
<extent unit="page">
<start>8538</start>
<end>8553</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T DiMS: Distilling Multiple Steps of Iterative Non-Autoregressive Transformers for Machine Translation
%A Norouzi, Sajad
%A Hosseinzadeh, Rasa
%A Perez, Felipe
%A Volkovs, Maksims
%Y Rogers, Anna
%Y Boyd-Graber, Jordan
%Y Okazaki, Naoaki
%S Findings of the Association for Computational Linguistics: ACL 2023
%D 2023
%8 July
%I Association for Computational Linguistics
%C Toronto, Canada
%F norouzi-etal-2023-dims
%X The computational benefits of iterative non-autoregressive transformers decrease as the number of decoding steps increases. As a remedy, we introduce Distill Multiple Steps (DiMS), a simple yet effective distillation technique to decrease the number of required steps to reach a certain translation quality. The distilled model enjoys the computational benefits of early iterations while preserving the enhancements from several iterative steps. DiMS relies on two models, namely a student and a teacher. The student is optimized to predict the output of the teacher after multiple decoding steps, while the teacher follows the student via a slow-moving average. The moving average keeps the teacher’s knowledge updated and enhances the quality of the labels provided by the teacher. During inference, the student is used for translation and no additional computation is added. We verify the effectiveness of DiMS on various models, obtaining 7.8 and 12.9 BLEU point improvements in single-step translation accuracy on distilled and raw versions of WMT’14 De-En. Full code for this work is available here: https://github.com/layer6ai-labs/DiMS
%R 10.18653/v1/2023.findings-acl.542
%U https://aclanthology.org/2023.findings-acl.542
%U https://doi.org/10.18653/v1/2023.findings-acl.542
%P 8538-8553
Markdown (Informal)
[DiMS: Distilling Multiple Steps of Iterative Non-Autoregressive Transformers for Machine Translation](https://aclanthology.org/2023.findings-acl.542) (Norouzi et al., Findings 2023)