@inproceedings{li-etal-2026-lightweight,
title = "Lightweight Haar Wavelet Subband Pruning for {LLM}s",
author = "Li, Jiang and
Cao, Pengfei and
Zhou, Chenxi and
Lan, Tian and
Su, Xiangdong and
Liu, Kang and
Zhao, Jun and
Gao, Guanglai",
editor = "Liakata, Maria and
Moreira, Viviane P. and
Zhang, Jiajun and
Jurgens, David",
booktitle = "Findings of the {A}ssociation for {C}omputational {L}inguistics: {ACL} 2026",
month = jul,
year = "2026",
address = "San Diego, California, United States",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.findings-acl.798/",
pages = "16242--16259",
ISBN = "979-8-89176-395-1",
abstract = "Large language models (LLMs) reach state-of-the-art performance across many NLP tasks, but their large parameter counts introduce heavy computational and memory overhead, which complicates deployment in resource-constrained settings. Pruning is a standard compression strategy that induces sparsity to lower these costs. However, most pruning methods for LLMs depend on calibration data and expensive weight updates, which limits practical scalability. To address these limitations, we introduce \textbf{H}aar \textbf{W}avelet \textbf{S}ubband \textbf{P}runing (HWSP), a post-training framework that requires no calibration data and no weight updates. HWSP applies a two-dimensional Haar wavelet transform to each weight matrix and decomposes it into four frequency subbands. It then assigns a uniform sparsity ratio to all subbands so that both low- and high-frequency components are retained in a balanced manner. Our theoretical analysis shows that the subband design of HWSP provides a deterministic per-subband retention guarantee, which helps mitigate the potential bias of global magnitude pruning toward dominant frequency components. Experiments on the LLaMA, OPT and Qwen model families show that HWSP achieves competitive accuracy relative to strong pruning baselines while substantially reducing pruning time. Compared with magnitude pruning, which serves as a simple calibration-free baseline, HWSP generally achieves better downstream performance across a wide range of sparsity levels and model scales."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="li-etal-2026-lightweight">
<titleInfo>
<title>Lightweight Haar Wavelet Subband Pruning for LLMs</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jiang</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pengfei</namePart>
<namePart type="family">Cao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chenxi</namePart>
<namePart type="family">Zhou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tian</namePart>
<namePart type="family">Lan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xiangdong</namePart>
<namePart type="family">Su</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kang</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jun</namePart>
<namePart type="family">Zhao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Guanglai</namePart>
<namePart type="family">Gao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2026</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Liakata</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Viviane</namePart>
<namePart type="given">P</namePart>
<namePart type="family">Moreira</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiajun</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Jurgens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-395-1</identifier>
</relatedItem>
<abstract>Large language models (LLMs) reach state-of-the-art performance across many NLP tasks, but their large parameter counts introduce heavy computational and memory overhead, which complicates deployment in resource-constrained settings. Pruning is a standard compression strategy that induces sparsity to lower these costs. However, most pruning methods for LLMs depend on calibration data and expensive weight updates, which limits practical scalability. To address these limitations, we introduce Haar Wavelet Subband Pruning (HWSP), a post-training framework that requires no calibration data and no weight updates. HWSP applies a two-dimensional Haar wavelet transform to each weight matrix and decomposes it into four frequency subbands. It then assigns a uniform sparsity ratio to all subbands so that both low- and high-frequency components are retained in a balanced manner. Our theoretical analysis shows that the subband design of HWSP provides a deterministic per-subband retention guarantee, which helps mitigate the potential bias of global magnitude pruning toward dominant frequency components. Experiments on the LLaMA, OPT and Qwen model families show that HWSP achieves competitive accuracy relative to strong pruning baselines while substantially reducing pruning time. Compared with magnitude pruning, which serves as a simple calibration-free baseline, HWSP generally achieves better downstream performance across a wide range of sparsity levels and model scales.</abstract>
<identifier type="citekey">li-etal-2026-lightweight</identifier>
<location>
<url>https://aclanthology.org/2026.findings-acl.798/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>16242</start>
<end>16259</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Lightweight Haar Wavelet Subband Pruning for LLMs
%A Li, Jiang
%A Cao, Pengfei
%A Zhou, Chenxi
%A Lan, Tian
%A Su, Xiangdong
%A Liu, Kang
%A Zhao, Jun
%A Gao, Guanglai
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Findings of the Association for Computational Linguistics: ACL 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-395-1
%F li-etal-2026-lightweight
%X Large language models (LLMs) reach state-of-the-art performance across many NLP tasks, but their large parameter counts introduce heavy computational and memory overhead, which complicates deployment in resource-constrained settings. Pruning is a standard compression strategy that induces sparsity to lower these costs. However, most pruning methods for LLMs depend on calibration data and expensive weight updates, which limits practical scalability. To address these limitations, we introduce Haar Wavelet Subband Pruning (HWSP), a post-training framework that requires no calibration data and no weight updates. HWSP applies a two-dimensional Haar wavelet transform to each weight matrix and decomposes it into four frequency subbands. It then assigns a uniform sparsity ratio to all subbands so that both low- and high-frequency components are retained in a balanced manner. Our theoretical analysis shows that the subband design of HWSP provides a deterministic per-subband retention guarantee, which helps mitigate the potential bias of global magnitude pruning toward dominant frequency components. Experiments on the LLaMA, OPT and Qwen model families show that HWSP achieves competitive accuracy relative to strong pruning baselines while substantially reducing pruning time. Compared with magnitude pruning, which serves as a simple calibration-free baseline, HWSP generally achieves better downstream performance across a wide range of sparsity levels and model scales.
%U https://aclanthology.org/2026.findings-acl.798/
%P 16242-16259
Markdown (Informal)
[Lightweight Haar Wavelet Subband Pruning for LLMs](https://aclanthology.org/2026.findings-acl.798/) (Li et al., Findings 2026)
ACL
- Jiang Li, Pengfei Cao, Chenxi Zhou, Tian Lan, Xiangdong Su, Kang Liu, Jun Zhao, and Guanglai Gao. 2026. Lightweight Haar Wavelet Subband Pruning for LLMs. In Findings of the Association for Computational Linguistics: ACL 2026, pages 16242–16259, San Diego, California, United States. Association for Computational Linguistics.