@inproceedings{tapaninaho-2025-moep,
title = "{M}o{EP}: Modular Expert Paths for Sample-Efficient Language Modeling",
author = "Tapaninaho, Joonas",
editor = "Charpentier, Lucas and
Choshen, Leshem and
Cotterell, Ryan and
Gul, Mustafa Omer and
Hu, Michael Y. and
Liu, Jing and
Jumelet, Jaap and
Linzen, Tal and
Mueller, Aaron and
Ross, Candace and
Shah, Raj Sanjay and
Warstadt, Alex and
Wilcox, Ethan Gotlieb and
Williams, Adina",
booktitle = "Proceedings of the First BabyLM Workshop",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.babylm-main.39/",
pages = "540--547",
ISBN = "TODO",
abstract = "Training language models under tight compute budgets with small training datasets remains challenging for dense decoder-only Transformers, where every token activates the full stack of model parameters. We introduce MoEP (Modular Expert Paths), a sparse decoder-only architecture that enables more selective token activation, which increases model performance and accelerates learning without increasing the total number of parameters. We show that combining model parallelism with Mixture-of-Experts (MoE) style linear projections and a lightweight top-k router outperforms the GPT-2 baseline and stabilizes evaluation performance more quickly."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="tapaninaho-2025-moep">
<titleInfo>
<title>MoEP: Modular Expert Paths for Sample-Efficient Language Modeling</title>
</titleInfo>
<name type="personal">
<namePart type="given">Joonas</namePart>
<namePart type="family">Tapaninaho</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the First BabyLM Workshop</title>
</titleInfo>
<name type="personal">
<namePart type="given">Lucas</namePart>
<namePart type="family">Charpentier</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Leshem</namePart>
<namePart type="family">Choshen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ryan</namePart>
<namePart type="family">Cotterell</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mustafa</namePart>
<namePart type="given">Omer</namePart>
<namePart type="family">Gul</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Michael</namePart>
<namePart type="given">Y</namePart>
<namePart type="family">Hu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jing</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jaap</namePart>
<namePart type="family">Jumelet</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tal</namePart>
<namePart type="family">Linzen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aaron</namePart>
<namePart type="family">Mueller</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Candace</namePart>
<namePart type="family">Ross</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Raj</namePart>
<namePart type="given">Sanjay</namePart>
<namePart type="family">Shah</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alex</namePart>
<namePart type="family">Warstadt</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ethan</namePart>
<namePart type="given">Gotlieb</namePart>
<namePart type="family">Wilcox</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Adina</namePart>
<namePart type="family">Williams</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Suzhou, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">TODO</identifier>
</relatedItem>
<abstract>Training language models under tight compute budgets with small training datasets remains challenging for dense decoder-only Transformers, where every token activates the full stack of model parameters. We introduce MoEP (Modular Expert Paths), a sparse decoder-only architecture that enables more selective token activation, which increases model performance and accelerates learning without increasing the total number of parameters. We show that combining model parallelism with Mixture-of-Experts (MoE) style linear projections and a lightweight top-k router outperforms the GPT-2 baseline and stabilizes evaluation performance more quickly.</abstract>
<identifier type="citekey">tapaninaho-2025-moep</identifier>
<location>
<url>https://aclanthology.org/2025.babylm-main.39/</url>
</location>
<part>
<date>2025-11</date>
<extent unit="page">
<start>540</start>
<end>547</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T MoEP: Modular Expert Paths for Sample-Efficient Language Modeling
%A Tapaninaho, Joonas
%Y Charpentier, Lucas
%Y Choshen, Leshem
%Y Cotterell, Ryan
%Y Gul, Mustafa Omer
%Y Hu, Michael Y.
%Y Liu, Jing
%Y Jumelet, Jaap
%Y Linzen, Tal
%Y Mueller, Aaron
%Y Ross, Candace
%Y Shah, Raj Sanjay
%Y Warstadt, Alex
%Y Wilcox, Ethan Gotlieb
%Y Williams, Adina
%S Proceedings of the First BabyLM Workshop
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ TODO
%F tapaninaho-2025-moep
%X Training language models under tight compute budgets with small training datasets remains challenging for dense decoder-only Transformers, where every token activates the full stack of model parameters. We introduce MoEP (Modular Expert Paths), a sparse decoder-only architecture that enables more selective token activation, which increases model performance and accelerates learning without increasing the total number of parameters. We show that combining model parallelism with Mixture-of-Experts (MoE) style linear projections and a lightweight top-k router outperforms the GPT-2 baseline and stabilizes evaluation performance more quickly.
%U https://aclanthology.org/2025.babylm-main.39/
%P 540-547
Markdown (Informal)
[MoEP: Modular Expert Paths for Sample-Efficient Language Modeling](https://aclanthology.org/2025.babylm-main.39/) (Tapaninaho, BabyLM 2025)
ACL
Joonas Tapaninaho. 2025. [MoEP: Modular Expert Paths for Sample-Efficient Language Modeling](https://aclanthology.org/2025.babylm-main.39/). In *Proceedings of the First BabyLM Workshop*, pages 540–547, Suzhou, China. Association for Computational Linguistics.
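
The abstract describes the core mechanism, a lightweight top-k router over MoE-style expert projections, only at a high level. Below is a minimal PyTorch sketch of generic top-k expert routing for readers unfamiliar with the pattern; it is an illustration of the standard technique, not the authors' implementation, and all names (`TopKRouter`, `gate`) and the default `k=2` are assumptions.

```python
import torch
import torch.nn.functional as F

class TopKRouter(torch.nn.Module):
    """Generic top-k router: scores each token against n_experts and
    keeps only the k highest-scoring experts per token (hypothetical
    sketch, not the MoEP paper's code)."""

    def __init__(self, d_model: int, n_experts: int, k: int = 2):
        super().__init__()
        # A single linear layer produces one routing logit per expert.
        self.gate = torch.nn.Linear(d_model, n_experts, bias=False)
        self.k = k

    def forward(self, x: torch.Tensor):
        # x: (batch, seq, d_model)
        logits = self.gate(x)                        # (batch, seq, n_experts)
        weights, idx = logits.topk(self.k, dim=-1)   # keep k experts per token
        weights = F.softmax(weights, dim=-1)         # renormalize kept scores
        # Caller combines the selected experts' outputs using these weights,
        # so each token activates only k of the n_experts parameter paths.
        return weights, idx

# Usage sketch: route a batch of hidden states to 2 of 8 experts.
router = TopKRouter(d_model=256, n_experts=8, k=2)
weights, idx = router(torch.randn(4, 16, 256))
```

Because only `k` expert paths run per token, compute per token stays roughly constant as `n_experts` grows, which is the sparsity property the abstract attributes to MoEP's selective token activation.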