@inproceedings{sun-etal-2024-mixture,
title = "Mixture of Diverse Size Experts",
author = "Sun, Manxi and
Liu, Wei and
Luan, Jian and
Gao, Pengzhi and
Wang, Bin",
editor = "Dernoncourt, Franck and
Preo{\c{t}}iuc-Pietro, Daniel and
Shimorina, Anastasia",
booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing: Industry Track",
month = nov,
year = "2024",
address = "Miami, Florida, US",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.emnlp-industry.118",
pages = "1608--1621",
abstract = "The Sparsely-Activated Mixture-of-Experts (MoE) architecture has gained popularity for scaling large language models (LLMs) due to the sub-linearly increasing computational costs. Despite its success, most of the current structure designs face the challenge that the experts share the same size such that tokens have no chance to choose the experts with the most appropriate size to generate the next token. To mitigate this defect, we propose Mixture of Diverse Size Experts (MoDSE), a new MoE architecture with designed layers where experts have different sizes. Analysis on difficult token generation tasks shows that experts with different sizes give better predictions, and the routing path of the experts tends to be stable after a period of training. The diversity of experts{'} size will lead to load unbalancing. To tackle this limitation, we introduce an expert-pair allocation strategy to distribute the workload evenly across the GPUs. Comprehensive evaluations across multiple benchmarks demonstrate the effectiveness of MoDSE, surpassing existing MoEs by adaptively assigning the parameter budget to experts while maintaining the same total parameter size and number of experts.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="sun-etal-2024-mixture">
<titleInfo>
<title>Mixture of Diverse Size Experts</title>
</titleInfo>
<name type="personal">
<namePart type="given">Manxi</namePart>
<namePart type="family">Sun</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wei</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jian</namePart>
<namePart type="family">Luan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pengzhi</namePart>
<namePart type="family">Gao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bin</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing: Industry Track</title>
</titleInfo>
<name type="personal">
<namePart type="given">Franck</namePart>
<namePart type="family">Dernoncourt</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Daniel</namePart>
<namePart type="family">Preoţiuc-Pietro</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anastasia</namePart>
<namePart type="family">Shimorina</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, Florida, US</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>The Sparsely-Activated Mixture-of-Experts (MoE) architecture has gained popularity for scaling large language models (LLMs) due to the sub-linearly increasing computational costs. Despite its success, most of the current structure designs face the challenge that the experts share the same size such that tokens have no chance to choose the experts with the most appropriate size to generate the next token. To mitigate this defect, we propose Mixture of Diverse Size Experts (MoDSE), a new MoE architecture with designed layers where experts have different sizes. Analysis on difficult token generation tasks shows that experts with different sizes give better predictions, and the routing path of the experts tends to be stable after a period of training. The diversity of experts’ size will lead to load unbalancing. To tackle this limitation, we introduce an expert-pair allocation strategy to distribute the workload evenly across the GPUs. Comprehensive evaluations across multiple benchmarks demonstrate the effectiveness of MoDSE, surpassing existing MoEs by adaptively assigning the parameter budget to experts while maintaining the same total parameter size and number of experts.</abstract>
<identifier type="citekey">sun-etal-2024-mixture</identifier>
<location>
<url>https://aclanthology.org/2024.emnlp-industry.118</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>1608</start>
<end>1621</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Mixture of Diverse Size Experts
%A Sun, Manxi
%A Liu, Wei
%A Luan, Jian
%A Gao, Pengzhi
%A Wang, Bin
%Y Dernoncourt, Franck
%Y Preoţiuc-Pietro, Daniel
%Y Shimorina, Anastasia
%S Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing: Industry Track
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, US
%F sun-etal-2024-mixture
%X The Sparsely-Activated Mixture-of-Experts (MoE) architecture has gained popularity for scaling large language models (LLMs) due to the sub-linearly increasing computational costs. Despite its success, most of the current structure designs face the challenge that the experts share the same size such that tokens have no chance to choose the experts with the most appropriate size to generate the next token. To mitigate this defect, we propose Mixture of Diverse Size Experts (MoDSE), a new MoE architecture with designed layers where experts have different sizes. Analysis on difficult token generation tasks shows that experts with different sizes give better predictions, and the routing path of the experts tends to be stable after a period of training. The diversity of experts’ size will lead to load unbalancing. To tackle this limitation, we introduce an expert-pair allocation strategy to distribute the workload evenly across the GPUs. Comprehensive evaluations across multiple benchmarks demonstrate the effectiveness of MoDSE, surpassing existing MoEs by adaptively assigning the parameter budget to experts while maintaining the same total parameter size and number of experts.
%U https://aclanthology.org/2024.emnlp-industry.118
%P 1608-1621
Markdown (Informal)
[Mixture of Diverse Size Experts](https://aclanthology.org/2024.emnlp-industry.118) (Sun et al., EMNLP 2024)
ACL
- Manxi Sun, Wei Liu, Jian Luan, Pengzhi Gao, and Bin Wang. 2024. Mixture of Diverse Size Experts. In Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing: Industry Track, pages 1608–1621, Miami, Florida, US. Association for Computational Linguistics.