@inproceedings{zhao-etal-2021-multi,
title = "Multi-split Reversible Transformers Can Enhance Neural Machine Translation",
author = "Zhao, Yuekai and
Zhou, Shuchang and
Zhang, Zhihua",
editor = "Merlo, Paola and
Tiedemann, Jorg and
Tsarfaty, Reut",
booktitle = "Proceedings of the 16th Conference of the European Chapter of the Association for Computational Linguistics: Main Volume",
month = apr,
year = "2021",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.eacl-main.19",
doi = "10.18653/v1/2021.eacl-main.19",
pages = "244--254",
abstract = "Large-scale transformers have been shown the state-of-the-art on neural machine translation. However, training these increasingly wider and deeper models could be tremendously memory intensive. We reduce the memory burden by employing the idea of reversible networks that a layer{'}s input can be reconstructed from its output. We design three types of multi-split based reversible transformers. We also devise a corresponding backpropagation algorithm, which does not need to store activations for most layers. Furthermore, we present two fine-tuning techniques: splits shuffle and self ensemble, to boost translation accuracy. Specifically, our best models surpass the vanilla transformer by at least 1.4 BLEU points in three datasets. Our large-scale reversible models achieve 30.0 BLEU in WMT{'}14 En-De and 43.5 BLEU in WMT{'}14 En-Fr, beating several very strong baselines with less than half of the training memory.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="zhao-etal-2021-multi">
<titleInfo>
<title>Multi-split Reversible Transformers Can Enhance Neural Machine Translation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yuekai</namePart>
<namePart type="family">Zhao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shuchang</namePart>
<namePart type="family">Zhou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhihua</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-04</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 16th Conference of the European Chapter of the Association for Computational Linguistics: Main Volume</title>
</titleInfo>
<name type="personal">
<namePart type="given">Paola</namePart>
<namePart type="family">Merlo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jorg</namePart>
<namePart type="family">Tiedemann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Reut</namePart>
<namePart type="family">Tsarfaty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Large-scale transformers have been shown the state-of-the-art on neural machine translation. However, training these increasingly wider and deeper models could be tremendously memory intensive. We reduce the memory burden by employing the idea of reversible networks that a layer’s input can be reconstructed from its output. We design three types of multi-split based reversible transformers. We also devise a corresponding backpropagation algorithm, which does not need to store activations for most layers. Furthermore, we present two fine-tuning techniques: splits shuffle and self ensemble, to boost translation accuracy. Specifically, our best models surpass the vanilla transformer by at least 1.4 BLEU points in three datasets. Our large-scale reversible models achieve 30.0 BLEU in WMT’14 En-De and 43.5 BLEU in WMT’14 En-Fr, beating several very strong baselines with less than half of the training memory.</abstract>
<identifier type="citekey">zhao-etal-2021-multi</identifier>
<identifier type="doi">10.18653/v1/2021.eacl-main.19</identifier>
<location>
<url>https://aclanthology.org/2021.eacl-main.19</url>
</location>
<part>
<date>2021-04</date>
<extent unit="page">
<start>244</start>
<end>254</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Multi-split Reversible Transformers Can Enhance Neural Machine Translation
%A Zhao, Yuekai
%A Zhou, Shuchang
%A Zhang, Zhihua
%Y Merlo, Paola
%Y Tiedemann, Jorg
%Y Tsarfaty, Reut
%S Proceedings of the 16th Conference of the European Chapter of the Association for Computational Linguistics: Main Volume
%D 2021
%8 April
%I Association for Computational Linguistics
%C Online
%F zhao-etal-2021-multi
%X Large-scale transformers have been shown the state-of-the-art on neural machine translation. However, training these increasingly wider and deeper models could be tremendously memory intensive. We reduce the memory burden by employing the idea of reversible networks that a layer’s input can be reconstructed from its output. We design three types of multi-split based reversible transformers. We also devise a corresponding backpropagation algorithm, which does not need to store activations for most layers. Furthermore, we present two fine-tuning techniques: splits shuffle and self ensemble, to boost translation accuracy. Specifically, our best models surpass the vanilla transformer by at least 1.4 BLEU points in three datasets. Our large-scale reversible models achieve 30.0 BLEU in WMT’14 En-De and 43.5 BLEU in WMT’14 En-Fr, beating several very strong baselines with less than half of the training memory.
%R 10.18653/v1/2021.eacl-main.19
%U https://aclanthology.org/2021.eacl-main.19
%U https://doi.org/10.18653/v1/2021.eacl-main.19
%P 244-254
Markdown (Informal)
[Multi-split Reversible Transformers Can Enhance Neural Machine Translation](https://aclanthology.org/2021.eacl-main.19) (Zhao et al., EACL 2021)
ACL