@inproceedings{lee-etal-2025-stun,
    title = "{STUN}: Structured-Then-Unstructured Pruning for Scalable {M}o{E} Pruning",
    author = "Lee, Jaeseong  and
      Hwang, Seung-won  and
      Qiao, Aurick  and
      Campos, Daniel F  and
      Yao, Zhewei  and
      He, Yuxiong",
    editor = "Che, Wanxiang  and
      Nabende, Joyce  and
      Shutova, Ekaterina  and
      Pilehvar, Mohammad Taher",
    booktitle = "Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
    month = jul,
    year = "2025",
    address = "Vienna, Austria",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2025.acl-long.671/",
    doi = "10.18653/v1/2025.acl-long.671",
    pages = "13660--13676",
    ISBN = "979-8-89176-251-0",
    abstract = "Mixture-of-experts (MoEs) have been adopted for reducing inference costs by sparsely activating experts in large language models (LLMs). Despite these reductions, the massive number of parameters in MoEs still makes them expensive to serve. Conventionally, unstructured or structured pruning has been considered to reduce number of parameters. Our key contribution is exploring the interpolation between structured and unstructured pruning, to propose a novel structured-then-unstructured (STUN) approach outperforming both of structured or unstructured pruning, especially for MoEs. In the first stage, we show a scalable expert pruning with O(1) forward pass, unlike existing work requiring $O(\frac{k^n}{\sqrt{n}})$ forward passes for $n$ experts that cannot scale for recent MoEs with hundreds of experts. We then show our expert-pruned MoEs are robust to unstructured pruning to follow. Experiments on Snowflake Arctic and Mixtral shows that our proposal is highly effective{--} For Snowflake Arctic, a 480B-sized MoE with 128 experts, our method needs only one H100 and two hours to achieve nearly no loss in performance with 40{\%} sparsity, even in generative tasks such as GSM8K, where state-of-the-art structured or unstructured pruning methods fail. The code is publicly available."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="lee-etal-2025-stun">
    <titleInfo>
        <title>STUN: Structured-Then-Unstructured Pruning for Scalable MoE Pruning</title>
    </titleInfo>
    <name type="personal">
        <namePart type="given">Jaeseong</namePart>
        <namePart type="family">Lee</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Seung-won</namePart>
        <namePart type="family">Hwang</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Aurick</namePart>
        <namePart type="family">Qiao</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Daniel</namePart>
        <namePart type="given">F</namePart>
        <namePart type="family">Campos</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Zhewei</namePart>
        <namePart type="family">Yao</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Yuxiong</namePart>
        <namePart type="family">He</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <originInfo>
        <dateIssued>2025-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
        <titleInfo>
            <title>Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
        </titleInfo>
        <name type="personal">
            <namePart type="given">Wanxiang</namePart>
            <namePart type="family">Che</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <name type="personal">
            <namePart type="given">Joyce</namePart>
            <namePart type="family">Nabende</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <name type="personal">
            <namePart type="given">Ekaterina</namePart>
            <namePart type="family">Shutova</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <name type="personal">
            <namePart type="given">Mohammad</namePart>
            <namePart type="given">Taher</namePart>
            <namePart type="family">Pilehvar</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <originInfo>
            <publisher>Association for Computational Linguistics</publisher>
            <place>
                <placeTerm type="text">Vienna, Austria</placeTerm>
            </place>
        </originInfo>
        <genre authority="marcgt">conference publication</genre>
        <identifier type="isbn">979-8-89176-251-0</identifier>
    </relatedItem>
    <abstract>Mixture-of-experts (MoEs) have been adopted to reduce inference costs by sparsely activating experts in large language models (LLMs). Despite these reductions, the massive number of parameters in MoEs still makes them expensive to serve. Conventionally, unstructured or structured pruning has been considered to reduce the number of parameters. Our key contribution is exploring the interpolation between structured and unstructured pruning, to propose a novel structured-then-unstructured (STUN) approach that outperforms both structured and unstructured pruning, especially for MoEs. In the first stage, we show scalable expert pruning with O(1) forward passes, unlike existing work requiring O(kⁿ/√n) forward passes for n experts, which cannot scale to recent MoEs with hundreds of experts. We then show that our expert-pruned MoEs are robust to the unstructured pruning that follows. Experiments on Snowflake Arctic and Mixtral show that our proposal is highly effective: for Snowflake Arctic, a 480B-sized MoE with 128 experts, our method needs only one H100 and two hours to achieve nearly no loss in performance at 40% sparsity, even on generative tasks such as GSM8K, where state-of-the-art structured or unstructured pruning methods fail. The code is publicly available.</abstract>
    <identifier type="citekey">lee-etal-2025-stun</identifier>
    <identifier type="doi">10.18653/v1/2025.acl-long.671</identifier>
    <location>
        <url>https://aclanthology.org/2025.acl-long.671/</url>
    </location>
    <part>
        <date>2025-07</date>
        <extent unit="page">
            <start>13660</start>
            <end>13676</end>
        </extent>
    </part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T STUN: Structured-Then-Unstructured Pruning for Scalable MoE Pruning
%A Lee, Jaeseong
%A Hwang, Seung-won
%A Qiao, Aurick
%A Campos, Daniel F.
%A Yao, Zhewei
%A He, Yuxiong
%Y Che, Wanxiang
%Y Nabende, Joyce
%Y Shutova, Ekaterina
%Y Pilehvar, Mohammad Taher
%S Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-251-0
%F lee-etal-2025-stun
%X Mixture-of-experts (MoEs) have been adopted to reduce inference costs by sparsely activating experts in large language models (LLMs). Despite these reductions, the massive number of parameters in MoEs still makes them expensive to serve. Conventionally, unstructured or structured pruning has been considered to reduce the number of parameters. Our key contribution is exploring the interpolation between structured and unstructured pruning, to propose a novel structured-then-unstructured (STUN) approach that outperforms both structured and unstructured pruning, especially for MoEs. In the first stage, we show scalable expert pruning with O(1) forward passes, unlike existing work requiring O(kⁿ/√n) forward passes for n experts, which cannot scale to recent MoEs with hundreds of experts. We then show that our expert-pruned MoEs are robust to the unstructured pruning that follows. Experiments on Snowflake Arctic and Mixtral show that our proposal is highly effective: for Snowflake Arctic, a 480B-sized MoE with 128 experts, our method needs only one H100 and two hours to achieve nearly no loss in performance at 40% sparsity, even on generative tasks such as GSM8K, where state-of-the-art structured or unstructured pruning methods fail. The code is publicly available.
%R 10.18653/v1/2025.acl-long.671
%U https://aclanthology.org/2025.acl-long.671/
%U https://doi.org/10.18653/v1/2025.acl-long.671
%P 13660-13676
Markdown (Informal)
[STUN: Structured-Then-Unstructured Pruning for Scalable MoE Pruning](https://aclanthology.org/2025.acl-long.671/) (Lee et al., ACL 2025)
ACL
Jaeseong Lee, Seung-won Hwang, Aurick Qiao, Daniel F Campos, Zhewei Yao, and Yuxiong He. 2025. STUN: Structured-Then-Unstructured Pruning for Scalable MoE Pruning. In Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 13660–13676, Vienna, Austria. Association for Computational Linguistics.
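
To put the abstract's complexity claim in perspective (a back-of-the-envelope illustration only, not taken from the paper; the abstract does not fix $k$, so a small constant $k = 2$ is assumed here), plugging the cited 128 experts of Snowflake Arctic into the prior-work cost $O(k^n/\sqrt{n})$ gives

\[
\frac{2^{128}}{\sqrt{128}} \;\approx\; \frac{3.4 \times 10^{38}}{11.3} \;\approx\; 3 \times 10^{37}
\]

forward passes, compared with the O(1) forward passes of STUN's first stage, which is why exhaustive expert-combination search cannot scale to MoEs with hundreds of experts.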