BibTeX
@inproceedings{pham-etal-2025-fine,
title = "How to Fine-Tune Safely on a Budget: Model Adaptation Using Minimal Resources",
author = "Pham, Anh C. and
Thalanki, Mihir and
Sun, Michael and
Chaloo, Aditya and
Gupta, Ankita and
Xia, Tian and
Mate, Aditya and
Nosakhare, Ehi and
Srinivasan, Soundararajan",
editor = "Potdar, Saloni and
Rojas-Barahona, Lina and
Montella, Sebastien",
booktitle = "Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing: Industry Track",
month = nov,
year = "2025",
address = "Suzhou (China)",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.emnlp-industry.138/",
pages = "1970--1981",
ISBN = "979-8-89176-333-3",
abstract = "Supervised fine-tuning (SFT) on benign data can paradoxically erode a language model{'}s safety alignment, a phenomenon known as catastrophic forgetting of safety behaviors. Although prior work shows that randomly adding safety examples can reduce harmful output, the principles that make certain examples more effective than others remain poorly understood. This paper investigates the hypothesis that the effectiveness of a safety example is governed by two key factors: its instruction-response behavior (e.g., refusal vs. explanation) and its semantic diversity across harm categories. We systematically evaluate sampling strategies based on these axes and find that structured, diversity-aware sampling significantly improves model safety. Our method reduces harmfulness by up to 41{\%} while adding only 0.05{\%} more data to the fine-tuning set."
}

MODS XML
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="pham-etal-2025-fine">
<titleInfo>
<title>How to Fine-Tune Safely on a Budget: Model Adaptation Using Minimal Resources</title>
</titleInfo>
<name type="personal">
<namePart type="given">Anh</namePart>
<namePart type="given">C</namePart>
<namePart type="family">Pham</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mihir</namePart>
<namePart type="family">Thalanki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Michael</namePart>
<namePart type="family">Sun</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aditya</namePart>
<namePart type="family">Chaloo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ankita</namePart>
<namePart type="family">Gupta</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tian</namePart>
<namePart type="family">Xia</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aditya</namePart>
<namePart type="family">Mate</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ehi</namePart>
<namePart type="family">Nosakhare</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Soundararajan</namePart>
<namePart type="family">Srinivasan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing: Industry Track</title>
</titleInfo>
<name type="personal">
<namePart type="given">Saloni</namePart>
<namePart type="family">Potdar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lina</namePart>
<namePart type="family">Rojas-Barahona</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sebastien</namePart>
<namePart type="family">Montella</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Suzhou (China)</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-333-3</identifier>
</relatedItem>
<abstract>Supervised fine-tuning (SFT) on benign data can paradoxically erode a language model’s safety alignment, a phenomenon known as catastrophic forgetting of safety behaviors. Although prior work shows that randomly adding safety examples can reduce harmful output, the principles that make certain examples more effective than others remain poorly understood. This paper investigates the hypothesis that the effectiveness of a safety example is governed by two key factors: its instruction-response behavior (e.g., refusal vs. explanation) and its semantic diversity across harm categories. We systematically evaluate sampling strategies based on these axes and find that structured, diversity-aware sampling significantly improves model safety. Our method reduces harmfulness by up to 41% while adding only 0.05% more data to the fine-tuning set.</abstract>
<identifier type="citekey">pham-etal-2025-fine</identifier>
<location>
<url>https://aclanthology.org/2025.emnlp-industry.138/</url>
</location>
<part>
<date>2025-11</date>
<extent unit="page">
<start>1970</start>
<end>1981</end>
</extent>
</part>
</mods>
</modsCollection>
Endnote
%0 Conference Proceedings
%T How to Fine-Tune Safely on a Budget: Model Adaptation Using Minimal Resources
%A Pham, Anh C.
%A Thalanki, Mihir
%A Sun, Michael
%A Chaloo, Aditya
%A Gupta, Ankita
%A Xia, Tian
%A Mate, Aditya
%A Nosakhare, Ehi
%A Srinivasan, Soundararajan
%Y Potdar, Saloni
%Y Rojas-Barahona, Lina
%Y Montella, Sebastien
%S Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing: Industry Track
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou (China)
%@ 979-8-89176-333-3
%F pham-etal-2025-fine
%X Supervised fine-tuning (SFT) on benign data can paradoxically erode a language model’s safety alignment, a phenomenon known as catastrophic forgetting of safety behaviors. Although prior work shows that randomly adding safety examples can reduce harmful output, the principles that make certain examples more effective than others remain poorly understood. This paper investigates the hypothesis that the effectiveness of a safety example is governed by two key factors: its instruction-response behavior (e.g., refusal vs. explanation) and its semantic diversity across harm categories. We systematically evaluate sampling strategies based on these axes and find that structured, diversity-aware sampling significantly improves model safety. Our method reduces harmfulness by up to 41% while adding only 0.05% more data to the fine-tuning set.
%U https://aclanthology.org/2025.emnlp-industry.138/
%P 1970-1981
Markdown (Informal)
[How to Fine-Tune Safely on a Budget: Model Adaptation Using Minimal Resources](https://aclanthology.org/2025.emnlp-industry.138/) (Pham et al., EMNLP 2025)
ACL
Anh C. Pham, Mihir Thalanki, Michael Sun, Aditya Chaloo, Ankita Gupta, Tian Xia, Aditya Mate, Ehi Nosakhare, and Soundararajan Srinivasan. 2025. How to Fine-Tune Safely on a Budget: Model Adaptation Using Minimal Resources. In Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing: Industry Track, pages 1970–1981, Suzhou (China). Association for Computational Linguistics.