BibTeX

@inproceedings{galiano-jimenez-etal-2025-beyond,
    title = "Beyond the Mode: Sequence-Level Distillation of Multilingual Translation Models for Low-Resource Language Pairs",
    author = "Galiano-Jim{\'e}nez, Aar{\'o}n and
      P{\'e}rez-Ortiz, Juan Antonio and
      S{\'a}nchez-Mart{\'i}nez, Felipe and
      S{\'a}nchez-Cartagena, V{\'i}ctor M.",
    editor = "Chiruzzo, Luis and
      Ritter, Alan and
      Wang, Lu",
    booktitle = "Findings of the Association for Computational Linguistics: NAACL 2025",
    month = apr,
    year = "2025",
    address = "Albuquerque, New Mexico",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2025.findings-naacl.372/",
    doi = "10.18653/v1/2025.findings-naacl.372",
    pages = "6661--6676",
    ISBN = "979-8-89176-195-7",
    abstract = "This paper delves into sequence-level knowledge distillation (KD) of multilingual pre-trained translation models. We posit that, beyond the approximated mode obtained via beam search, the whole output distribution of the teacher contains valuable insights for students. We explore the potential of n-best lists from beam search to guide the student{'}s learning and then investigate alternative decoding methods to address observed issues like low variability and under-representation of infrequent tokens. Our research in data-limited scenarios reveals that although sampling methods can slightly compromise the translation quality of the teacher output compared to beam-search-based methods, they enrich the generated corpora with increased variability and lexical richness, ultimately enhancing student model performance and reducing the gender bias amplification commonly associated with KD."
}

MODS XML

<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="galiano-jimenez-etal-2025-beyond">
    <titleInfo>
      <title>Beyond the Mode: Sequence-Level Distillation of Multilingual Translation Models for Low-Resource Language Pairs</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Aarón</namePart>
      <namePart type="family">Galiano-Jiménez</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Juan</namePart>
      <namePart type="given">Antonio</namePart>
      <namePart type="family">Pérez-Ortiz</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Felipe</namePart>
      <namePart type="family">Sánchez-Martínez</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Víctor</namePart>
      <namePart type="given">M</namePart>
      <namePart type="family">Sánchez-Cartagena</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2025-04</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: NAACL 2025</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Luis</namePart>
        <namePart type="family">Chiruzzo</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Alan</namePart>
        <namePart type="family">Ritter</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Lu</namePart>
        <namePart type="family">Wang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Albuquerque, New Mexico</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-195-7</identifier>
    </relatedItem>
    <abstract>This paper delves into sequence-level knowledge distillation (KD) of multilingual pre-trained translation models. We posit that, beyond the approximated mode obtained via beam search, the whole output distribution of the teacher contains valuable insights for students. We explore the potential of n-best lists from beam search to guide the student’s learning and then investigate alternative decoding methods to address observed issues like low variability and under-representation of infrequent tokens. Our research in data-limited scenarios reveals that although sampling methods can slightly compromise the translation quality of the teacher output compared to beam-search-based methods, they enrich the generated corpora with increased variability and lexical richness, ultimately enhancing student model performance and reducing the gender bias amplification commonly associated with KD.</abstract>
    <identifier type="citekey">galiano-jimenez-etal-2025-beyond</identifier>
    <identifier type="doi">10.18653/v1/2025.findings-naacl.372</identifier>
    <location>
      <url>https://aclanthology.org/2025.findings-naacl.372/</url>
    </location>
    <part>
      <date>2025-04</date>
      <extent unit="page">
        <start>6661</start>
        <end>6676</end>
      </extent>
    </part>
  </mods>
</modsCollection>

Endnote

%0 Conference Proceedings
%T Beyond the Mode: Sequence-Level Distillation of Multilingual Translation Models for Low-Resource Language Pairs
%A Galiano-Jiménez, Aarón
%A Pérez-Ortiz, Juan Antonio
%A Sánchez-Martínez, Felipe
%A Sánchez-Cartagena, Víctor M.
%Y Chiruzzo, Luis
%Y Ritter, Alan
%Y Wang, Lu
%S Findings of the Association for Computational Linguistics: NAACL 2025
%D 2025
%8 April
%I Association for Computational Linguistics
%C Albuquerque, New Mexico
%@ 979-8-89176-195-7
%F galiano-jimenez-etal-2025-beyond
%X This paper delves into sequence-level knowledge distillation (KD) of multilingual pre-trained translation models. We posit that, beyond the approximated mode obtained via beam search, the whole output distribution of the teacher contains valuable insights for students. We explore the potential of n-best lists from beam search to guide the student’s learning and then investigate alternative decoding methods to address observed issues like low variability and under-representation of infrequent tokens. Our research in data-limited scenarios reveals that although sampling methods can slightly compromise the translation quality of the teacher output compared to beam-search-based methods, they enrich the generated corpora with increased variability and lexical richness, ultimately enhancing student model performance and reducing the gender bias amplification commonly associated with KD.
%R 10.18653/v1/2025.findings-naacl.372
%U https://aclanthology.org/2025.findings-naacl.372/
%U https://doi.org/10.18653/v1/2025.findings-naacl.372
%P 6661-6676

Markdown (Informal)
[Beyond the Mode: Sequence-Level Distillation of Multilingual Translation Models for Low-Resource Language Pairs](https://aclanthology.org/2025.findings-naacl.372/) (Galiano-Jiménez et al., Findings 2025)
ACL
Aarón Galiano-Jiménez, Juan Antonio Pérez-Ortiz, Felipe Sánchez-Martínez, and Víctor M. Sánchez-Cartagena. 2025. [Beyond the Mode: Sequence-Level Distillation of Multilingual Translation Models for Low-Resource Language Pairs](https://aclanthology.org/2025.findings-naacl.372/). In *Findings of the Association for Computational Linguistics: NAACL 2025*, pages 6661–6676, Albuquerque, New Mexico. Association for Computational Linguistics.
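
The two decoding strategies the abstract contrasts for generating the distilled corpus are easy to picture in code. Below is a minimal, hypothetical sketch (not the authors' released code) of how a seq2seq teacher can produce multiple translations per source sentence: keeping the n-best list from beam search versus drawing samples from the output distribution. The Hugging Face model name, sample counts, and decoding hyperparameters are illustrative assumptions, not the paper's actual setup.

```python
# Illustrative sketch of sequence-level KD data generation with a
# Hugging Face seq2seq teacher. Model and hyperparameters are placeholders.
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

MODEL_NAME = "Helsinki-NLP/opus-mt-en-de"  # stand-in teacher (assumption)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
teacher = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME).eval()


def teacher_nbest(sources: list[str], n: int = 4) -> list[str]:
    """Mode-seeking variant: keep the full n-best list from beam search,
    not only the single best (approximate mode) hypothesis."""
    batch = tokenizer(sources, return_tensors="pt", padding=True)
    with torch.no_grad():
        out = teacher.generate(
            **batch,
            num_beams=n,
            num_return_sequences=n,  # all n beam hypotheses per source
            max_new_tokens=128,
        )
    return tokenizer.batch_decode(out, skip_special_tokens=True)


def teacher_samples(sources: list[str], n: int = 4, top_p: float = 0.9) -> list[str]:
    """Sampling variant: draw n translations per source from the teacher's
    output distribution (nucleus sampling here), trading some quality for
    more variability and lexical richness in the distilled corpus."""
    batch = tokenizer(sources, return_tensors="pt", padding=True)
    with torch.no_grad():
        out = teacher.generate(
            **batch,
            do_sample=True,
            top_p=top_p,
            num_return_sequences=n,
            max_new_tokens=128,
        )
    return tokenizer.batch_decode(out, skip_special_tokens=True)


if __name__ == "__main__":
    src = ["The committee approved the proposal yesterday."]
    print(teacher_nbest(src))    # n high-probability, often similar outputs
    print(teacher_samples(src))  # n more varied outputs
```

Under this sketch, the student would then be trained with ordinary cross-entropy on the resulting (source, teacher-output) pairs, which is what makes the distillation sequence-level rather than token-level.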