@inproceedings{yuan-etal-2025-legomt2,
title = "{L}ego{MT}2: Selective Asynchronous Sharded Data Parallel Training for Massive Neural Machine Translation",
author = "Yuan, Fei and
Lu, Yinquan and
Li, Lei and
Xu, Jingjing",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2025",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-acl.1200/",
doi = "10.18653/v1/2025.findings-acl.1200",
pages = "23359--23376",
ISBN = "979-8-89176-256-5",
abstract = "It is a critical challenge to learn a single model for massive languages. Prior methods focus on increasing the model size and training data size. However, large models are difficult to optimize efficiently even with distributed parallel training and translation capacity can interfere among languages. To address the challenge, we propose LegoMT2, an efficient training approach with an asymmetric multi-way model architecture for massive multilingual neural machine translation. LegoMT2 shards 435 languages into 8 language-centric groups and attributes one local encoder for each group{'}s languages and a mix encoder-decoder for all languages. LegoMT2 trains the model through local data parallel and asynchronous distributed updating of parameters. LegoMT2 is 16.2$\times$ faster than the distributed training method for M2M-100-12B (which only for 100 languages) while improving the translation performance by an average of 2.2 BLEU on \textit{Flores-101}, especially performing better for low-resource languages ."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="yuan-etal-2025-legomt2">
<titleInfo>
<title>LegoMT2: Selective Asynchronous Sharded Data Parallel Training for Massive Neural Machine Translation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Fei</namePart>
<namePart type="family">Yuan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yinquan</namePart>
<namePart type="family">Lu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lei</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jingjing</namePart>
<namePart type="family">Xu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2025</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wanxiang</namePart>
<namePart type="family">Che</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joyce</namePart>
<namePart type="family">Nabende</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Shutova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohammad</namePart>
<namePart type="given">Taher</namePart>
<namePart type="family">Pilehvar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-256-5</identifier>
</relatedItem>
<abstract>It is a critical challenge to learn a single model for massive languages. Prior methods focus on increasing the model size and training data size. However, large models are difficult to optimize efficiently even with distributed parallel training and translation capacity can interfere among languages. To address the challenge, we propose LegoMT2, an efficient training approach with an asymmetric multi-way model architecture for massive multilingual neural machine translation. LegoMT2 shards 435 languages into 8 language-centric groups and attributes one local encoder for each group’s languages and a mix encoder-decoder for all languages. LegoMT2 trains the model through local data parallel and asynchronous distributed updating of parameters. LegoMT2 is 16.2\times faster than the distributed training method for M2M-100-12B (which only for 100 languages) while improving the translation performance by an average of 2.2 BLEU on Flores-101, especially performing better for low-resource languages .</abstract>
<identifier type="citekey">yuan-etal-2025-legomt2</identifier>
<identifier type="doi">10.18653/v1/2025.findings-acl.1200</identifier>
<location>
<url>https://aclanthology.org/2025.findings-acl.1200/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>23359</start>
<end>23376</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T LegoMT2: Selective Asynchronous Sharded Data Parallel Training for Massive Neural Machine Translation
%A Yuan, Fei
%A Lu, Yinquan
%A Li, Lei
%A Xu, Jingjing
%Y Che, Wanxiang
%Y Nabende, Joyce
%Y Shutova, Ekaterina
%Y Pilehvar, Mohammad Taher
%S Findings of the Association for Computational Linguistics: ACL 2025
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-256-5
%F yuan-etal-2025-legomt2
%X It is a critical challenge to learn a single model for a massive number of languages. Prior methods focus on increasing the model size and the training data size. However, large models are difficult to optimize efficiently even with distributed parallel training, and translation capacity can interfere among languages. To address this challenge, we propose LegoMT2, an efficient training approach with an asymmetric multi-way model architecture for massive multilingual neural machine translation. LegoMT2 shards 435 languages into 8 language-centric groups and attributes one local encoder to each group’s languages and a mix encoder-decoder to all languages. LegoMT2 trains the model through local data parallelism and asynchronous distributed updating of parameters. LegoMT2 is 16.2× faster than the distributed training method for M2M-100-12B (which covers only 100 languages) while improving translation performance by an average of 2.2 BLEU on Flores-101, performing especially well for low-resource languages.
%R 10.18653/v1/2025.findings-acl.1200
%U https://aclanthology.org/2025.findings-acl.1200/
%U https://doi.org/10.18653/v1/2025.findings-acl.1200
%P 23359-23376
Markdown (Informal)
[LegoMT2: Selective Asynchronous Sharded Data Parallel Training for Massive Neural Machine Translation](https://aclanthology.org/2025.findings-acl.1200/) (Yuan et al., Findings 2025)