@inproceedings{longkai-etal-2025-hookmoe,
title = "{H}ook{M}o{E}: A learnable performance compensation strategy of Mixture-of-Experts for {LLM} inference acceleration",
author = "Longkai, Cheng and
He, Along and
Li, Mulin and
Xueshuo, Xie and
Li, Tao",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.emnlp-main.1610/",
doi = "10.18653/v1/2025.emnlp-main.1610",
pages = "31594--31606",
ISBN = "979-8-89176-332-6",
abstract = "Mixture of Experts (MoE) architectures have emerged as a promising paradigm for scaling model capacity through top-$k$ routing mechanisms. Although reducing the number of activated experts inherently enables inference acceleration, this efficiency gain typically comes at the cost of significant performance degradation. To address this trade-off between efficiency and performance, we propose HookMoE, a plug-and-play single-layer compensation framework that effectively restores performance using only a small post-training calibration set. Our method strategically inserts a lightweight trainable Hook module immediately preceding selected transformer blocks. Comprehensive evaluations on four popular MoE models, with an average performance degradation of only 2.5{\%} across various benchmarks, our method reduces the number of activated experts by more than 50{\%} and achieves a 1.42$\times$ inference speed-up during the prefill stage. Through systematic analysis, we further reveal that the upper layers require fewer active experts, offering actionable insights for refining dynamic expert selection strategies and enhancing the overall efficiency of MoE models."
}