@inproceedings{li-etal-2025-bayeskd,
title = "{B}ayes{KD}: {B}ayesian Knowledge Distillation for Compact {LLM}s in Constrained Fine-tuning Scenarios",
author = "Li, Wei and
Li, Lujun and
Lee, Mark G. and
Sun, Shengjie and
Zhang, Lei and
Xue, Wei and
Guo, Yike",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2025",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-acl.7/",
doi = "10.18653/v1/2025.findings-acl.7",
pages = "138--152",
ISBN = "979-8-89176-256-5",
abstract = "Large language models (LLMs) have revolutionized various domains with their remarkable capabilities, but their massive parameter sizes pose significant challenges for fine-tuning and inference, especially in resource-constrained environments. Conventional compression methods often result in substantial performance degradation within LLMs and struggle to restore model quality during fine-tuning. To address this challenge, we present Bayesian Knowledge Distillation (BayesKD), a novel distillation framework meticulously designed for compact LLMs in resource-constrained fine-tuning scenarios. Departing from conventional LLM distillation methods that introduce time-consuming paradigms and fail to generalize in compressed LLM fine-tuning scenarios, our BayesKD develops the Logits Dual-Scaling, Knowledge Alignment Module, and Bayesian Distillation Optimization. In particular, our Logits Dual-Scaling strategy adaptively aligns the strength of the teacher{'}s knowledge transfer, while the Knowledge Alignment Module bridges the gap between the teacher and student models by projecting their knowledge representations into a shared interval. Additionally, we employ Logits-Aware Bayesian Optimization to swiftly identify optimal settings based on these strategies, thereby enhancing model performance. Extensive experiments across diverse tasks demonstrate that BayesKD consistently outperforms baseline methods on various state-of-the-art LLMs, including LLaMA, Qwen2, Bloom, and Vicuna. Notably, our BayesKD achieves average accuracy gains of 2.99{\%} and 4.05{\%} over standard KD for the 8B parameter LLaMA and Qwen2 model. Codes are available in the supplementary materials."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="li-etal-2025-bayeskd">
<titleInfo>
<title>BayesKD: Bayesian Knowledge Distillation for Compact LLMs in Constrained Fine-tuning Scenarios</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wei</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lujun</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mark</namePart>
<namePart type="given">G</namePart>
<namePart type="family">Lee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shengjie</namePart>
<namePart type="family">Sun</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lei</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wei</namePart>
<namePart type="family">Xue</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yike</namePart>
<namePart type="family">Guo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2025</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wanxiang</namePart>
<namePart type="family">Che</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joyce</namePart>
<namePart type="family">Nabende</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Shutova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohammad</namePart>
<namePart type="given">Taher</namePart>
<namePart type="family">Pilehvar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-256-5</identifier>
</relatedItem>
<abstract>Large language models (LLMs) have revolutionized various domains with their remarkable capabilities, but their massive parameter sizes pose significant challenges for fine-tuning and inference, especially in resource-constrained environments. Conventional compression methods often result in substantial performance degradation within LLMs and struggle to restore model quality during fine-tuning. To address this challenge, we present Bayesian Knowledge Distillation (BayesKD), a novel distillation framework meticulously designed for compact LLMs in resource-constrained fine-tuning scenarios. Departing from conventional LLM distillation methods that introduce time-consuming paradigms and fail to generalize in compressed LLM fine-tuning scenarios, our BayesKD develops the Logits Dual-Scaling, Knowledge Alignment Module, and Bayesian Distillation Optimization. In particular, our Logits Dual-Scaling strategy adaptively aligns the strength of the teacher’s knowledge transfer, while the Knowledge Alignment Module bridges the gap between the teacher and student models by projecting their knowledge representations into a shared interval. Additionally, we employ Logits-Aware Bayesian Optimization to swiftly identify optimal settings based on these strategies, thereby enhancing model performance. Extensive experiments across diverse tasks demonstrate that BayesKD consistently outperforms baseline methods on various state-of-the-art LLMs, including LLaMA, Qwen2, Bloom, and Vicuna. Notably, our BayesKD achieves average accuracy gains of 2.99% and 4.05% over standard KD for the 8B parameter LLaMA and Qwen2 model. Codes are available in the supplementary materials.</abstract>
<identifier type="citekey">li-etal-2025-bayeskd</identifier>
<identifier type="doi">10.18653/v1/2025.findings-acl.7</identifier>
<location>
<url>https://aclanthology.org/2025.findings-acl.7/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>138</start>
<end>152</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T BayesKD: Bayesian Knowledge Distillation for Compact LLMs in Constrained Fine-tuning Scenarios
%A Li, Wei
%A Li, Lujun
%A Lee, Mark G.
%A Sun, Shengjie
%A Zhang, Lei
%A Xue, Wei
%A Guo, Yike
%Y Che, Wanxiang
%Y Nabende, Joyce
%Y Shutova, Ekaterina
%Y Pilehvar, Mohammad Taher
%S Findings of the Association for Computational Linguistics: ACL 2025
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-256-5
%F li-etal-2025-bayeskd
%X Large language models (LLMs) have revolutionized various domains with their remarkable capabilities, but their massive parameter sizes pose significant challenges for fine-tuning and inference, especially in resource-constrained environments. Conventional compression methods often result in substantial performance degradation within LLMs and struggle to restore model quality during fine-tuning. To address this challenge, we present Bayesian Knowledge Distillation (BayesKD), a novel distillation framework meticulously designed for compact LLMs in resource-constrained fine-tuning scenarios. Departing from conventional LLM distillation methods that introduce time-consuming paradigms and fail to generalize in compressed LLM fine-tuning scenarios, our BayesKD develops the Logits Dual-Scaling, Knowledge Alignment Module, and Bayesian Distillation Optimization. In particular, our Logits Dual-Scaling strategy adaptively aligns the strength of the teacher’s knowledge transfer, while the Knowledge Alignment Module bridges the gap between the teacher and student models by projecting their knowledge representations into a shared interval. Additionally, we employ Logits-Aware Bayesian Optimization to swiftly identify optimal settings based on these strategies, thereby enhancing model performance. Extensive experiments across diverse tasks demonstrate that BayesKD consistently outperforms baseline methods on various state-of-the-art LLMs, including LLaMA, Qwen2, Bloom, and Vicuna. Notably, our BayesKD achieves average accuracy gains of 2.99% and 4.05% over standard KD for the 8B parameter LLaMA and Qwen2 model. Codes are available in the supplementary materials.
%R 10.18653/v1/2025.findings-acl.7
%U https://aclanthology.org/2025.findings-acl.7/
%U https://doi.org/10.18653/v1/2025.findings-acl.7
%P 138-152
Markdown (Informal)
[BayesKD: Bayesian Knowledge Distillation for Compact LLMs in Constrained Fine-tuning Scenarios](https://aclanthology.org/2025.findings-acl.7/) (Li et al., Findings 2025)
ACL