@inproceedings{diddee-etal-2022-brittle,
  author    = {Diddee, Harshita and
               Dandapat, Sandipan and
               Choudhury, Monojit and
               Ganu, Tanuja and
               Bali, Kalika},
  title     = {Too Brittle to Touch: Comparing the Stability of Quantization and Distillation towards Developing Low-Resource {MT} Models},
  booktitle = {Proceedings of the Seventh Conference on Machine Translation (WMT)},
  publisher = {Association for Computational Linguistics},
  address   = {Abu Dhabi, United Arab Emirates (Hybrid)},
  month     = dec,
  year      = {2022},
  pages     = {870--885},
  url       = {https://aclanthology.org/2022.wmt-1.80},
  abstract  = {Leveraging shared learning through Massively Multilingual Models, state-of-the-art Machine translation (MT) models are often able to adapt to the paucity of data for low-resource languages. However, this performance comes at the cost of significantly bloated models which aren{'}t practically deployable. Knowledge Distillation is one popular technique to develop competitive lightweight models: In this work, we first evaluate its use in compressing MT models, focusing specifically on languages with extremely limited training data. Through our analysis across 8 languages, we find that the variance in the performance of the distilled models due to their dependence on priors including the amount of synthetic data used for distillation, the student architecture, training hyper-parameters and confidence of the teacher models, makes distillation a brittle compression mechanism. To mitigate this, we further explore the use of post-training quantization for the compression of these models. Here, we find that while Distillation provides gains across some low-resource languages, Quantization provides more consistent performance trends for the entire range of languages, especially the lowest-resource languages in our target set.}
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- Single MODS record; its ID matches the citekey identifier below. -->
  <mods ID="diddee-etal-2022-brittle">
    <titleInfo>
      <title>Too Brittle to Touch: Comparing the Stability of Quantization and Distillation towards Developing Low-Resource MT Models</title>
    </titleInfo>
    <!-- Personal names, each carrying the "author" role. -->
    <name type="personal">
      <namePart type="given">Harshita</namePart>
      <namePart type="family">Diddee</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Sandipan</namePart>
      <namePart type="family">Dandapat</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Monojit</namePart>
      <namePart type="family">Choudhury</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Tanuja</namePart>
      <namePart type="family">Ganu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Kalika</namePart>
      <namePart type="family">Bali</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2022-12</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: title, publisher and place of the containing proceedings. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the Seventh Conference on Machine Translation (WMT)</title>
      </titleInfo>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Abu Dhabi, United Arab Emirates (Hybrid)</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Leveraging shared learning through Massively Multilingual Models, state-of-the-art Machine translation (MT) models are often able to adapt to the paucity of data for low-resource languages. However, this performance comes at the cost of significantly bloated models which aren’t practically deployable. Knowledge Distillation is one popular technique to develop competitive lightweight models: In this work, we first evaluate its use in compressing MT models, focusing specifically on languages with extremely limited training data. Through our analysis across 8 languages, we find that the variance in the performance of the distilled models due to their dependence on priors including the amount of synthetic data used for distillation, the student architecture, training hyper-parameters and confidence of the teacher models, makes distillation a brittle compression mechanism. To mitigate this, we further explore the use of post-training quantization for the compression of these models. Here, we find that while Distillation provides gains across some low-resource languages, Quantization provides more consistent performance trends for the entire range of languages, especially the lowest-resource languages in our target set.</abstract>
    <identifier type="citekey">diddee-etal-2022-brittle</identifier>
    <location>
      <url>https://aclanthology.org/2022.wmt-1.80</url>
    </location>
    <!-- Issue date and page extent of this paper. -->
    <part>
      <date>2022-12</date>
      <extent unit="page">
        <start>870</start>
        <end>885</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Too Brittle to Touch: Comparing the Stability of Quantization and Distillation towards Developing Low-Resource MT Models
%A Diddee, Harshita
%A Dandapat, Sandipan
%A Choudhury, Monojit
%A Ganu, Tanuja
%A Bali, Kalika
%S Proceedings of the Seventh Conference on Machine Translation (WMT)
%D 2022
%8 December
%I Association for Computational Linguistics
%C Abu Dhabi, United Arab Emirates (Hybrid)
%F diddee-etal-2022-brittle
%X Leveraging shared learning through Massively Multilingual Models, state-of-the-art Machine translation (MT) models are often able to adapt to the paucity of data for low-resource languages. However, this performance comes at the cost of significantly bloated models which aren’t practically deployable. Knowledge Distillation is one popular technique to develop competitive lightweight models: In this work, we first evaluate its use in compressing MT models, focusing specifically on languages with extremely limited training data. Through our analysis across 8 languages, we find that the variance in the performance of the distilled models due to their dependence on priors including the amount of synthetic data used for distillation, the student architecture, training hyper-parameters and confidence of the teacher models, makes distillation a brittle compression mechanism. To mitigate this, we further explore the use of post-training quantization for the compression of these models. Here, we find that while Distillation provides gains across some low-resource languages, Quantization provides more consistent performance trends for the entire range of languages, especially the lowest-resource languages in our target set.
%U https://aclanthology.org/2022.wmt-1.80
%P 870-885
Markdown (Informal)
[Too Brittle to Touch: Comparing the Stability of Quantization and Distillation towards Developing Low-Resource MT Models](https://aclanthology.org/2022.wmt-1.80) (Diddee et al., WMT 2022)
ACL