@inproceedings{behnke-etal-2021-efficient,
title = "Efficient Machine Translation with Model Pruning and Quantization",
author = "Behnke, Maximiliana and
Bogoychev, Nikolay and
Aji, Alham Fikri and
Heafield, Kenneth and
Nail, Graeme and
Zhu, Qianqian and
Tchistiakova, Svetlana and
van der Linde, Jelmer and
Chen, Pinzhen and
Kashyap, Sidharth and
Grundkiewicz, Roman",
booktitle = "Proceedings of the Sixth Conference on Machine Translation",
month = nov,
year = "2021",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.wmt-1.74",
pages = "775--780",
abstract = "We participated in all tracks of the WMT 2021 efficient machine translation task: single-core CPU, multi-core CPU, and GPU hardware with throughput and latency conditions. Our submissions combine several efficiency strategies: knowledge distillation, a simpler simple recurrent unit (SSRU) decoder with one or two layers, lexical shortlists, smaller numerical formats, and pruning. For the CPU track, we used quantized 8-bit models. For the GPU track, we experimented with FP16 and 8-bit integers in tensorcores. Some of our submissions optimize for size via 4-bit log quantization and omitting a lexical shortlist. We have extended pruning to more parts of the network, emphasizing component- and block-level pruning that actually improves speed unlike coefficient-wise pruning.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="behnke-etal-2021-efficient">
<titleInfo>
<title>Efficient Machine Translation with Model Pruning and Quantization</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maximiliana</namePart>
<namePart type="family">Behnke</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nikolay</namePart>
<namePart type="family">Bogoychev</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alham</namePart>
<namePart type="given">Fikri</namePart>
<namePart type="family">Aji</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kenneth</namePart>
<namePart type="family">Heafield</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Graeme</namePart>
<namePart type="family">Nail</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Qianqian</namePart>
<namePart type="family">Zhu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Svetlana</namePart>
<namePart type="family">Tchistiakova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jelmer</namePart>
<namePart type="family">van der Linde</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pinzhen</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sidharth</namePart>
<namePart type="family">Kashyap</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Roman</namePart>
<namePart type="family">Grundkiewicz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Sixth Conference on Machine Translation</title>
</titleInfo>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We participated in all tracks of the WMT 2021 efficient machine translation task: single-core CPU, multi-core CPU, and GPU hardware with throughput and latency conditions. Our submissions combine several efficiency strategies: knowledge distillation, a simpler simple recurrent unit (SSRU) decoder with one or two layers, lexical shortlists, smaller numerical formats, and pruning. For the CPU track, we used quantized 8-bit models. For the GPU track, we experimented with FP16 and 8-bit integers in tensorcores. Some of our submissions optimize for size via 4-bit log quantization and omitting a lexical shortlist. We have extended pruning to more parts of the network, emphasizing component- and block-level pruning that actually improves speed unlike coefficient-wise pruning.</abstract>
<identifier type="citekey">behnke-etal-2021-efficient</identifier>
<location>
<url>https://aclanthology.org/2021.wmt-1.74</url>
</location>
<part>
<date>2021-11</date>
<extent unit="page">
<start>775</start>
<end>780</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Efficient Machine Translation with Model Pruning and Quantization
%A Behnke, Maximiliana
%A Bogoychev, Nikolay
%A Aji, Alham Fikri
%A Heafield, Kenneth
%A Nail, Graeme
%A Zhu, Qianqian
%A Tchistiakova, Svetlana
%A van der Linde, Jelmer
%A Chen, Pinzhen
%A Kashyap, Sidharth
%A Grundkiewicz, Roman
%S Proceedings of the Sixth Conference on Machine Translation
%D 2021
%8 November
%I Association for Computational Linguistics
%C Online
%F behnke-etal-2021-efficient
%X We participated in all tracks of the WMT 2021 efficient machine translation task: single-core CPU, multi-core CPU, and GPU hardware with throughput and latency conditions. Our submissions combine several efficiency strategies: knowledge distillation, a simpler simple recurrent unit (SSRU) decoder with one or two layers, lexical shortlists, smaller numerical formats, and pruning. For the CPU track, we used quantized 8-bit models. For the GPU track, we experimented with FP16 and 8-bit integers in tensorcores. Some of our submissions optimize for size via 4-bit log quantization and omitting a lexical shortlist. We have extended pruning to more parts of the network, emphasizing component- and block-level pruning that actually improves speed unlike coefficient-wise pruning.
%U https://aclanthology.org/2021.wmt-1.74
%P 775-780
Markdown (Informal)
[Efficient Machine Translation with Model Pruning and Quantization](https://aclanthology.org/2021.wmt-1.74) (Behnke et al., WMT 2021)
ACL
- Maximiliana Behnke, Nikolay Bogoychev, Alham Fikri Aji, Kenneth Heafield, Graeme Nail, Qianqian Zhu, Svetlana Tchistiakova, Jelmer van der Linde, Pinzhen Chen, Sidharth Kashyap, and Roman Grundkiewicz. 2021. Efficient Machine Translation with Model Pruning and Quantization. In Proceedings of the Sixth Conference on Machine Translation, pages 775–780, Online. Association for Computational Linguistics.