@inproceedings{gritsch-etal-2025-nexus,
title = "Nexus: Adaptive Upcycling to Efficiently Pretrain Mixture of Experts",
author = {Gritsch, Nikolas and
Zhang, Qizhen and
Locatelli, Acyr and
Hooker, Sara and
{\"U}st{\"u}n, Ahmet},
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2025",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-emnlp.1323/",
pages = "24364--24381",
ISBN = "979-8-89176-335-7",
abstract = "Frontier language models are increasingly based on the Mixture of Experts (MoE) architecture, boosting the efficiency of training and inference by sparsely activating parameters. Nevertheless, training from scratch on trillions of tokens remains so expensive that most users can only finetune these models. In this work, we combine parameter reuse of dense models for the MoE layers (''*upcycling*'') with a novel, *adaptive* Nexus router that can integrate new experts into an existing trained model without hurting the performance on previous domains. Our router leverages the knowledge of each expert{'}s training data distribution via domain embeddings to initialize the router, improving specialization and allowing it to adapt faster to new domains than a standard MoE router. Nexus overturns the strict sequential separation between training and finetuning in classical approaches, allowing more powerful improvements to existing models at a later stage through long token-horizon trainings on new pretraining data. Our experiments show that Nexus achieves a relative gain of up to 2.1{\%} over the baseline for initial upcycling, and an 18.8{\%} relative gain for extending the MoE to a new domain with a new expert by using limited finetuning data. This flexibility of Nexus can power an open-source ecosystem where every user continuously assembles their own MoE-mix from a multitude of dense models."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="gritsch-etal-2025-nexus">
<titleInfo>
<title>Nexus: Adaptive Upcycling to Efficiently Pretrain Mixture of Experts</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nikolas</namePart>
<namePart type="family">Gritsch</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Qizhen</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Acyr</namePart>
<namePart type="family">Locatelli</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sara</namePart>
<namePart type="family">Hooker</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ahmet</namePart>
<namePart type="family">Üstün</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2025</title>
</titleInfo>
<name type="personal">
<namePart type="given">Christos</namePart>
<namePart type="family">Christodoulopoulos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tanmoy</namePart>
<namePart type="family">Chakraborty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Carolyn</namePart>
<namePart type="family">Rose</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Violet</namePart>
<namePart type="family">Peng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Suzhou, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-335-7</identifier>
</relatedItem>
<abstract>Frontier language models are increasingly based on the Mixture of Experts (MoE) architecture, boosting the efficiency of training and inference by sparsely activating parameters. Nevertheless, training from scratch on trillions of tokens remains so expensive that most users can only finetune these models. In this work, we combine parameter reuse of dense models for the MoE layers (”*upcycling*”) with a novel, *adaptive* Nexus router that can integrate new experts into an existing trained model without hurting the performance on previous domains. Our router leverages the knowledge of each expert’s training data distribution via domain embeddings to initialize the router, improving specialization and allowing it to adapt faster to new domains than a standard MoE router. Nexus overturns the strict sequential separation between training and finetuning in classical approaches, allowing more powerful improvements to existing models at a later stage through long token-horizon trainings on new pretraining data. Our experiments show that Nexus achieves a relative gain of up to 2.1% over the baseline for initial upcycling, and an 18.8% relative gain for extending the MoE to a new domain with a new expert by using limited finetuning data. This flexibility of Nexus can power an open-source ecosystem where every user continuously assembles their own MoE-mix from a multitude of dense models.</abstract>
<identifier type="citekey">gritsch-etal-2025-nexus</identifier>
<location>
<url>https://aclanthology.org/2025.findings-emnlp.1323/</url>
</location>
<part>
<date>2025-11</date>
<extent unit="page">
<start>24364</start>
<end>24381</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Nexus: Adaptive Upcycling to Efficiently Pretrain Mixture of Experts
%A Gritsch, Nikolas
%A Zhang, Qizhen
%A Locatelli, Acyr
%A Hooker, Sara
%A Üstün, Ahmet
%Y Christodoulopoulos, Christos
%Y Chakraborty, Tanmoy
%Y Rose, Carolyn
%Y Peng, Violet
%S Findings of the Association for Computational Linguistics: EMNLP 2025
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-335-7
%F gritsch-etal-2025-nexus
%X Frontier language models are increasingly based on the Mixture of Experts (MoE) architecture, boosting the efficiency of training and inference by sparsely activating parameters. Nevertheless, training from scratch on trillions of tokens remains so expensive that most users can only finetune these models. In this work, we combine parameter reuse of dense models for the MoE layers (”*upcycling*”) with a novel, *adaptive* Nexus router that can integrate new experts into an existing trained model without hurting the performance on previous domains. Our router leverages the knowledge of each expert’s training data distribution via domain embeddings to initialize the router, improving specialization and allowing it to adapt faster to new domains than a standard MoE router. Nexus overturns the strict sequential separation between training and finetuning in classical approaches, allowing more powerful improvements to existing models at a later stage through long token-horizon trainings on new pretraining data. Our experiments show that Nexus achieves a relative gain of up to 2.1% over the baseline for initial upcycling, and an 18.8% relative gain for extending the MoE to a new domain with a new expert by using limited finetuning data. This flexibility of Nexus can power an open-source ecosystem where every user continuously assembles their own MoE-mix from a multitude of dense models.
%U https://aclanthology.org/2025.findings-emnlp.1323/
%P 24364-24381
Markdown (Informal)
[Nexus: Adaptive Upcycling to Efficiently Pretrain Mixture of Experts](https://aclanthology.org/2025.findings-emnlp.1323/) (Gritsch et al., Findings 2025)
ACL