@inproceedings{chamma-etal-2026-mixturekit,
title = "{M}ixture{K}it: A General Framework for Composing, Training, and Visualizing Mixture-of-Experts Models",
author = "Chamma, Ahmad and
El Herraoui, Omar and
Shang, Guokan",
editor = "Durrett, Greg and
Jian, Ping",
booktitle = "Proceedings of the 64th Annual Meeting of the {A}ssociation for {C}omputational {L}inguistics (Volume 3: System Demonstrations)",
month = jul,
year = "2026",
address = "San Diego, California, United States",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.acl-demo.15/",
pages = "148--156",
ISBN = "979-8-89176-392-0",
abstract = "We introduce MixtureKit, a modular open-source framework for constructing, training, and analyzing Mixture-of-Experts (MoE) models from arbitrary pre-trained or fine-tuned checkpoints. MixtureKit supports three complementary strategies: (i) Traditional MoE, using a single router per transformer block to select experts; (ii) BTX (Branch-Train-Mix), adding routers at user-specified sub-layers for fine-grained token routing; and (iii) BTS (Branch-Train-Stitch), preserving experts intact and introducing lightweight stitch layers for controlled hub{--}expert information exchange. Given a single configuration dictionary, MixtureKit automatically modifies model configuration, patches decoder and causal LM classes, and exports a unified transformers-compatible checkpoint ready for inference or further fine-tuning. We also provide a visualization interface to inspect token routing, expert weight distributions, and layer-wise contributions. Experiments on multilingual code-switched (Arabic{--}Latin) data show that BTX models built with MixtureKit can outperform dense baselines across multiple benchmarks. The library is accessible at: https://github.com/MBZUAI-Paris/MixtureKit."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="chamma-etal-2026-mixturekit">
<titleInfo>
<title>MixtureKit: A General Framework for Composing, Training, and Visualizing Mixture-of-Experts Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ahmad</namePart>
<namePart type="family">Chamma</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Omar</namePart>
<namePart type="family">El Herraoui</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Guokan</namePart>
<namePart type="family">Shang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 3: System Demonstrations)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Greg</namePart>
<namePart type="family">Durrett</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ping</namePart>
<namePart type="family">Jian</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-392-0</identifier>
</relatedItem>
<abstract>We introduce MixtureKit, a modular open-source framework for constructing, training, and analyzing Mixture-of-Experts (MoE) models from arbitrary pre-trained or fine-tuned checkpoints. MixtureKit supports three complementary strategies: (i) Traditional MoE, using a single router per transformer block to select experts; (ii) BTX (Branch-Train-Mix), adding routers at user-specified sub-layers for fine-grained token routing; and (iii) BTS (Branch-Train-Stitch), preserving experts intact and introducing lightweight stitch layers for controlled hub–expert information exchange. Given a single configuration dictionary, MixtureKit automatically modifies model configuration, patches decoder and causal LM classes, and exports a unified transformers-compatible checkpoint ready for inference or further fine-tuning. We also provide a visualization interface to inspect token routing, expert weight distributions, and layer-wise contributions. Experiments on multilingual code-switched (Arabic–Latin) data show that BTX models built with MixtureKit can outperform dense baselines across multiple benchmarks. The library is accessible at: https://github.com/MBZUAI-Paris/MixtureKit.</abstract>
<identifier type="citekey">chamma-etal-2026-mixturekit</identifier>
<location>
<url>https://aclanthology.org/2026.acl-demo.15/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>148</start>
<end>156</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T MixtureKit: A General Framework for Composing, Training, and Visualizing Mixture-of-Experts Models
%A Chamma, Ahmad
%A El Herraoui, Omar
%A Shang, Guokan
%Y Durrett, Greg
%Y Jian, Ping
%S Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 3: System Demonstrations)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-392-0
%F chamma-etal-2026-mixturekit
%X We introduce MixtureKit, a modular open-source framework for constructing, training, and analyzing Mixture-of-Experts (MoE) models from arbitrary pre-trained or fine-tuned checkpoints. MixtureKit supports three complementary strategies: (i) Traditional MoE, using a single router per transformer block to select experts; (ii) BTX (Branch-Train-Mix), adding routers at user-specified sub-layers for fine-grained token routing; and (iii) BTS (Branch-Train-Stitch), preserving experts intact and introducing lightweight stitch layers for controlled hub–expert information exchange. Given a single configuration dictionary, MixtureKit automatically modifies model configuration, patches decoder and causal LM classes, and exports a unified transformers-compatible checkpoint ready for inference or further fine-tuning. We also provide a visualization interface to inspect token routing, expert weight distributions, and layer-wise contributions. Experiments on multilingual code-switched (Arabic–Latin) data show that BTX models built with MixtureKit can outperform dense baselines across multiple benchmarks. The library is accessible at: https://github.com/MBZUAI-Paris/MixtureKit.
%U https://aclanthology.org/2026.acl-demo.15/
%P 148-156
Markdown (Informal)
[MixtureKit: A General Framework for Composing, Training, and Visualizing Mixture-of-Experts Models](https://aclanthology.org/2026.acl-demo.15/) (Chamma et al., ACL 2026)
ACL