@inproceedings{zhang-etal-2018-analyzing,
title = "Analyzing Knowledge Distillation in Neural Machine Translation",
author = "Zhang, Dakun and
Crego, Josep and
Senellart, Jean",
editor = "Turchi, Marco and
Niehues, Jan and
    Federico, Marcello",
booktitle = "Proceedings of the 15th International Conference on Spoken Language Translation",
month = oct # " 29-30",
year = "2018",
address = "Brussels",
publisher = "International Conference on Spoken Language Translation",
url = "https://aclanthology.org/2018.iwslt-1.4",
pages = "23--30",
abstract = "Knowledge distillation has recently been successfully applied to neural machine translation. It allows for building shrunk networks while the resulting systems retain most of the quality of the original model. Despite the fact that many authors report on the benefits of knowledge distillation, few have discussed the actual reasons why it works, especially in the context of neural MT. In this paper, we conduct several experiments aimed at understanding why and how distillation impacts accuracy on an English-German translation task. We show that translation complexity is actually reduced when building a distilled/synthesised bi-text when compared to the reference bi-text. We further remove noisy data from synthesised translations and merge filtered synthesised data together with original reference, thus achieving additional gains in terms of accuracy.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="zhang-etal-2018-analyzing">
<titleInfo>
<title>Analyzing Knowledge Distillation in Neural Machine Translation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Dakun</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Josep</namePart>
<namePart type="family">Crego</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jean</namePart>
<namePart type="family">Senellart</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>October 29-30, 2018</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 15th International Conference on Spoken Language Translation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Marco</namePart>
<namePart type="family">Turchi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jan</namePart>
<namePart type="family">Niehues</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marcello</namePart>
<namePart type="family">Frederico</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>International Conference on Spoken Language Translation</publisher>
<place>
<placeTerm type="text">Brussels</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Knowledge distillation has recently been successfully applied to neural machine translation. It allows for building shrunk networks while the resulting systems retain most of the quality of the original model. Despite the fact that many authors report on the benefits of knowledge distillation, few have discussed the actual reasons why it works, especially in the context of neural MT. In this paper, we conduct several experiments aimed at understanding why and how distillation impacts accuracy on an English-German translation task. We show that translation complexity is actually reduced when building a distilled/synthesised bi-text when compared to the reference bi-text. We further remove noisy data from synthesised translations and merge filtered synthesised data together with original reference, thus achieving additional gains in terms of accuracy.</abstract>
<identifier type="citekey">zhang-etal-2018-analyzing</identifier>
<location>
<url>https://aclanthology.org/2018.iwslt-1.4</url>
</location>
<part>
<date>October 29-30, 2018</date>
<extent unit="page">
<start>23</start>
<end>30</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Analyzing Knowledge Distillation in Neural Machine Translation
%A Zhang, Dakun
%A Crego, Josep
%A Senellart, Jean
%Y Turchi, Marco
%Y Niehues, Jan
%Y Federico, Marcello
%S Proceedings of the 15th International Conference on Spoken Language Translation
%D 2018
%8 October 29-30
%I International Conference on Spoken Language Translation
%C Brussels
%F zhang-etal-2018-analyzing
%X Knowledge distillation has recently been successfully applied to neural machine translation. It allows for building shrunk networks while the resulting systems retain most of the quality of the original model. Despite the fact that many authors report on the benefits of knowledge distillation, few have discussed the actual reasons why it works, especially in the context of neural MT. In this paper, we conduct several experiments aimed at understanding why and how distillation impacts accuracy on an English-German translation task. We show that translation complexity is actually reduced when building a distilled/synthesised bi-text when compared to the reference bi-text. We further remove noisy data from synthesised translations and merge filtered synthesised data together with original reference, thus achieving additional gains in terms of accuracy.
%U https://aclanthology.org/2018.iwslt-1.4
%P 23-30
Markdown (Informal)
[Analyzing Knowledge Distillation in Neural Machine Translation](https://aclanthology.org/2018.iwslt-1.4) (Zhang et al., IWSLT 2018)
ACL
Dakun Zhang, Josep Crego, and Jean Senellart. 2018. Analyzing Knowledge Distillation in Neural Machine Translation. In Proceedings of the 15th International Conference on Spoken Language Translation, pages 23–30, Brussels. International Conference on Spoken Language Translation.