@inproceedings{chen-etal-2025-p2,
title = "P$^2$ Law: Scaling Law for Post-Training After Model Pruning",
author = "Chen, Xiaodong and
Hu, Yuxuan and
Zhang, Xiaokang and
Wang, Yanling and
Li, Cuiping and
Chen, Hong and
Zhang, Jing",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.acl-long.283/",
doi = "10.18653/v1/2025.acl-long.283",
pages = "5668--5686",
ISBN = "979-8-89176-251-0",
abstract = "Pruning has become a widely adopted technique for reducing the hardware requirements of large language models (LLMs). To recover model performance after pruning, post-training is commonly employed to mitigate the resulting performance degradation. While post-training benefits from larger datasets, once the dataset size is already substantial, increasing the training data provides only limited performance gains. To balance post-training cost and model performance, it is necessary to explore the optimal amount of post-training data. Through extensive experiments on the Llama-3 and Qwen-2.5 series models, pruned using various common pruning methods, we uncover the scaling \textbf{Law} for \textbf{P}ost-training after model \textbf{P}runing, referred to as the P$^2$ Law. This law identifies four key factors for predicting the pruned model{'}s post-training loss: the model size before pruning, the number of post-training tokens, the pruning rate, and the model{'}s loss before pruning. Moreover, P$^2$ Law can generalize to larger dataset sizes, larger model sizes, and higher pruning rates, offering valuable insights for the post-training of pruned LLMs."
}
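For orientation, the abstract's four predictive factors can be arranged into a Chinchilla-style power-law ansatz. The following is a purely illustrative sketch under assumed notation (pre-pruning parameter count $N$, post-training tokens $D$, pruning rate $\rho$, pre-pruning loss $L_0$); the coefficients $A, B, C$ and exponents $\alpha, \beta, \gamma$ are free placeholders, and this is not the fitted functional form the paper reports:

% Hypothetical illustration only: a power-law ansatz over the four factors
% named in the abstract. The exact P^2 Law is given in the paper itself.
\[
  \hat{L}(N, D, \rho, L_0)
    \;=\; L_0
    \;+\; \frac{A}{\bigl(N\,(1-\rho)\bigr)^{\alpha}}
    \;+\; \frac{B}{D^{\beta}}
    \;+\; C\,\rho^{\gamma}
\]

Here $N(1-\rho)$ stands in for the post-pruning parameter count, and the additive $L_0$ term reflects the abstract's claim that the pre-pruning loss is itself a predictor of the post-training loss.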