@inproceedings{gong-etal-2024-mixture,
title = "Mixture-of-Modules: Reinventing Transformers as Dynamic Assemblies of Modules",
author = "Gong, Zhuocheng and
Lv, Ang and
Guan, Jian and
Wu, Wei and
Zhang, Huishuai and
Huang, Minlie and
Zhao, Dongyan and
Yan, Rui",
editor = "Al-Onaizan, Yaser and
Bansal, Mohit and
Chen, Yun-Nung",
booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.emnlp-main.1164",
pages = "20924--20938",
abstract = "Is it always necessary to compute tokens from shallow to deep layers in Transformers? The continued success of vanilla Transformers and their variants suggests an undoubted {``}yes{''}. In this work, however, we attempt to break the depth-ordered convention by proposing a novel architecture dubbed mixture-of-modules (MoM), which is motivated by the intuition that any layer, regardless of its position, can be used to compute a token as long as it possesses the needed processing capabilities. The construction of MoM starts from a finite set of modules defined by multi-head attention and feed-forward networks, each distinguished by its unique parameterization. Two routers then iteratively select attention modules and feed-forward modules from the set to process a token. The selection dynamically expands the computation graph in the forward pass of the token, culminating in an assembly of modules. We show that MoM provides not only a unified framework for Transformers and their numerous variants but also a flexible and learnable approach for reducing redundancy in Transformer parameterization. We pre-train various MoMs using OpenWebText. Empirical results demonstrate that MoMs of different sizes consistently outperform vanilla Transformers. More interestingly, after removing 50{\%} of the multi-head attention modules and 25{\%} of the feed-forward modules, an MoM model still maintains comparable performance. Additionally, by properly adjusting the number of modules and compressing the model depth, one can obtain an MoM that achieves comparable performance to GPT-2 (774M) while saving 16{\%} TFLOPs and 42{\%} memory usage during forward computation.",
}