% Conference paper record for ACL Anthology item 2022.naacl-main.406.
% All string fields are brace-delimited; the variable k in the abstract is
% written in math mode at every occurrence.
@inproceedings{yang-etal-2022-nearest,
    title     = {Nearest Neighbor Knowledge Distillation for Neural Machine Translation},
    author    = {Yang, Zhixian and
                 Sun, Renliang and
                 Wan, Xiaojun},
    editor    = {Carpuat, Marine and
                 de Marneffe, Marie-Catherine and
                 Meza Ruiz, Ivan Vladimir},
    booktitle = {Proceedings of the 2022 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies},
    month     = jul,
    year      = {2022},
    address   = {Seattle, United States},
    publisher = {Association for Computational Linguistics},
    url       = {https://aclanthology.org/2022.naacl-main.406},
    doi       = {10.18653/v1/2022.naacl-main.406},
    pages     = {5546--5556},
    abstract  = {$k$-nearest-neighbor machine translation ($k$NN-MT), proposed by Khandelwal et al. (2021), has achieved many state-of-the-art results in machine translation tasks. Although effective, $k$NN-MT requires conducting $k$NN searches through the large datastore for each decoding step during inference, prohibitively increasing the decoding cost and thus leading to the difficulty for the deployment in real-world applications. In this paper, we propose to move the time-consuming $k$NN search forward to the preprocessing phase, and then introduce $k$ Nearest Neighbor Knowledge Distillation ($k$NN-KD) that trains the base NMT model to directly learn the knowledge of $k$NN. Distilling knowledge retrieved by $k$NN can encourage the NMT model to take more reasonable target tokens into consideration, thus addressing the overcorrection problem. Extensive experimental results show that, the proposed method achieves consistent improvement over the state-of-the-art baselines including $k$NN-MT, while maintaining the same training and decoding speed as the standard NMT model.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 record for citekey yang-etal-2022-nearest. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="yang-etal-2022-nearest">
    <titleInfo>
      <title>Nearest Neighbor Knowledge Distillation for Neural Machine Translation</title>
    </titleInfo>
    <!-- Authors of the paper, in listed order. -->
    <name type="personal">
      <namePart type="given">Zhixian</namePart>
      <namePart type="family">Yang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Renliang</namePart>
      <namePart type="family">Sun</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Xiaojun</namePart>
      <namePart type="family">Wan</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2022-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, with its editors and publisher. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 2022 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Marine</namePart>
        <namePart type="family">Carpuat</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Marie-Catherine</namePart>
        <namePart type="family">de Marneffe</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Ivan</namePart>
        <namePart type="given">Vladimir</namePart>
        <namePart type="family">Meza Ruiz</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Seattle, United States</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>k-nearest-neighbor machine translation (kNN-MT), proposed by Khandelwal et al. (2021), has achieved many state-of-the-art results in machine translation tasks. Although effective, kNN-MT requires conducting kNN searches through the large datastore for each decoding step during inference, prohibitively increasing the decoding cost and thus leading to the difficulty for the deployment in real-world applications. In this paper, we propose to move the time-consuming kNN search forward to the preprocessing phase, and then introduce k Nearest Neighbor Knowledge Distillation (kNN-KD) that trains the base NMT model to directly learn the knowledge of kNN. Distilling knowledge retrieved by kNN can encourage the NMT model to take more reasonable target tokens into consideration, thus addressing the overcorrection problem. Extensive experimental results show that, the proposed method achieves consistent improvement over the state-of-the-art baselines including kNN-MT, while maintaining the same training and decoding speed as the standard NMT model.</abstract>
    <!-- Identifiers and access location. -->
    <identifier type="citekey">yang-etal-2022-nearest</identifier>
    <identifier type="doi">10.18653/v1/2022.naacl-main.406</identifier>
    <location>
      <url>https://aclanthology.org/2022.naacl-main.406</url>
    </location>
    <!-- Issue date and page extent of the paper within the host volume. -->
    <part>
      <date>2022-07</date>
      <extent unit="page">
        <start>5546</start>
        <end>5556</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Nearest Neighbor Knowledge Distillation for Neural Machine Translation
%A Yang, Zhixian
%A Sun, Renliang
%A Wan, Xiaojun
%Y Carpuat, Marine
%Y de Marneffe, Marie-Catherine
%Y Meza Ruiz, Ivan Vladimir
%S Proceedings of the 2022 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies
%D 2022
%8 July
%I Association for Computational Linguistics
%C Seattle, United States
%F yang-etal-2022-nearest
%X k-nearest-neighbor machine translation (kNN-MT), proposed by Khandelwal et al. (2021), has achieved many state-of-the-art results in machine translation tasks. Although effective, kNN-MT requires conducting kNN searches through the large datastore for each decoding step during inference, prohibitively increasing the decoding cost and thus leading to the difficulty for the deployment in real-world applications. In this paper, we propose to move the time-consuming kNN search forward to the preprocessing phase, and then introduce k Nearest Neighbor Knowledge Distillation (kNN-KD) that trains the base NMT model to directly learn the knowledge of kNN. Distilling knowledge retrieved by kNN can encourage the NMT model to take more reasonable target tokens into consideration, thus addressing the overcorrection problem. Extensive experimental results show that, the proposed method achieves consistent improvement over the state-of-the-art baselines including kNN-MT, while maintaining the same training and decoding speed as the standard NMT model.
%R 10.18653/v1/2022.naacl-main.406
%U https://aclanthology.org/2022.naacl-main.406
%U https://doi.org/10.18653/v1/2022.naacl-main.406
%P 5546-5556
Markdown (Informal)
[Nearest Neighbor Knowledge Distillation for Neural Machine Translation](https://aclanthology.org/2022.naacl-main.406) (Yang et al., NAACL 2022)
ACL
- Zhixian Yang, Renliang Sun, and Xiaojun Wan. 2022. Nearest Neighbor Knowledge Distillation for Neural Machine Translation. In Proceedings of the 2022 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, pages 5546–5556, Seattle, United States. Association for Computational Linguistics.