@inproceedings{pan-etal-2026-dart,
title = "{DART}: Mitigating Harm Drift in Difference-Aware {LLM}s via Distill-Audit-Repair Training",
author = "Pan, Ziwen and
Liang, Zihan and
Kabbara, Jad and
Emami, Ali",
editor = "Liakata, Maria and
Moreira, Viviane P. and
Zhang, Jiajun and
Jurgens, David",
booktitle = "Findings of the {A}ssociation for {C}omputational {L}inguistics: {ACL} 2026",
month = jul,
year = "2026",
address = "San Diego, California, United States",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.findings-acl.244/",
pages = "4940--4980",
ISBN = "979-8-89176-395-1",
abstract = "Large language models (LLMs) tuned for safety often avoid acknowledging demographic differences, even when such acknowledgment is factually correct (e.g., ancestry-based disease incidence) or contextually justified (e.g., religious hiring preferences). This *identity-blindness* yields incorrect responses, unnecessary refusals, or generic ``equal-treatment'' defaults. We study this via difference-awareness classification: given a question involving demographic groups, the task is not to answer directly, but to classify whether a correct answer requires recognizing group differences (**YES**) or whether groups should be treated identically (**NO**). Crucially, fine-tuning for accuracy triggers *harm drift*: model-generated explanations become increasingly harmful as decision accuracy improves, whether by elaborating harmful content, introducing problematic assumptions, or failing to flag harms the baseline identified. To mitigate this, we introduce **DART** (**D**istill{--}**A**udit{--}**R**epair **T**raining), which distills label-conditioned reasoning from a teacher, audits outputs for harm drift cases relative to baseline, and repairs problematic cases via severity-weighted fine-tuning. On eight benchmarks, DART improves Llama-3-8B-Instruct accuracy from 39.0{\%} to 68.8{\%}, with largest gains on equal-treatment prompts (11.3{\%} {\textrightarrow} 72.6{\%}), while reducing harm drift cases by 72.6{\%}. It also transfers to 280 open-ended real-world queries across medical, legal, policy, and educational domains, improving difference-appropriate responses from 39.8{\%} to 77.5{\%} while reducing refusals from 34.3{\%} to 3.0{\%}. Our results demonstrate that accuracy and safety need not conflict when explicit detection and repair mechanisms are in place."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="pan-etal-2026-dart">
<titleInfo>
<title>DART: Mitigating Harm Drift in Difference-Aware LLMs via Distill-Audit-Repair Training</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ziwen</namePart>
<namePart type="family">Pan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zihan</namePart>
<namePart type="family">Liang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jad</namePart>
<namePart type="family">Kabbara</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ali</namePart>
<namePart type="family">Emami</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2026</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Liakata</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Viviane</namePart>
<namePart type="given">P</namePart>
<namePart type="family">Moreira</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiajun</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Jurgens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-395-1</identifier>
</relatedItem>
<abstract>Large language models (LLMs) tuned for safety often avoid acknowledging demographic differences, even when such acknowledgment is factually correct (e.g., ancestry-based disease incidence) or contextually justified (e.g., religious hiring preferences). This *identity-blindness* yields incorrect responses, unnecessary refusals, or generic “equal-treatment” defaults. We study this via difference-awareness classification: given a question involving demographic groups, the task is not to answer directly, but to classify whether a correct answer requires recognizing group differences (**YES**) or whether groups should be treated identically (**NO**). Crucially, fine-tuning for accuracy triggers *harm drift*: model-generated explanations become increasingly harmful as decision accuracy improves, whether by elaborating harmful content, introducing problematic assumptions, or failing to flag harms the baseline identified. To mitigate this, we introduce **DART** (**D**istill–**A**udit–**R**epair **T**raining), which distills label-conditioned reasoning from a teacher, audits outputs for harm drift cases relative to baseline, and repairs problematic cases via severity-weighted fine-tuning. On eight benchmarks, DART improves Llama-3-8B-Instruct accuracy from 39.0% to 68.8%, with largest gains on equal-treatment prompts (11.3% → 72.6%), while reducing harm drift cases by 72.6%. It also transfers to 280 open-ended real-world queries across medical, legal, policy, and educational domains, improving difference-appropriate responses from 39.8% to 77.5% while reducing refusals from 34.3% to 3.0%. Our results demonstrate that accuracy and safety need not conflict when explicit detection and repair mechanisms are in place.</abstract>
<identifier type="citekey">pan-etal-2026-dart</identifier>
<location>
<url>https://aclanthology.org/2026.findings-acl.244/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>4940</start>
<end>4980</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T DART: Mitigating Harm Drift in Difference-Aware LLMs via Distill-Audit-Repair Training
%A Pan, Ziwen
%A Liang, Zihan
%A Kabbara, Jad
%A Emami, Ali
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Findings of the Association for Computational Linguistics: ACL 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-395-1
%F pan-etal-2026-dart
%X Large language models (LLMs) tuned for safety often avoid acknowledging demographic differences, even when such acknowledgment is factually correct (e.g., ancestry-based disease incidence) or contextually justified (e.g., religious hiring preferences). This *identity-blindness* yields incorrect responses, unnecessary refusals, or generic “equal-treatment” defaults. We study this via difference-awareness classification: given a question involving demographic groups, the task is not to answer directly, but to classify whether a correct answer requires recognizing group differences (**YES**) or whether groups should be treated identically (**NO**). Crucially, fine-tuning for accuracy triggers *harm drift*: model-generated explanations become increasingly harmful as decision accuracy improves, whether by elaborating harmful content, introducing problematic assumptions, or failing to flag harms the baseline identified. To mitigate this, we introduce **DART** (**D**istill–**A**udit–**R**epair **T**raining), which distills label-conditioned reasoning from a teacher, audits outputs for harm drift cases relative to baseline, and repairs problematic cases via severity-weighted fine-tuning. On eight benchmarks, DART improves Llama-3-8B-Instruct accuracy from 39.0% to 68.8%, with largest gains on equal-treatment prompts (11.3% → 72.6%), while reducing harm drift cases by 72.6%. It also transfers to 280 open-ended real-world queries across medical, legal, policy, and educational domains, improving difference-appropriate responses from 39.8% to 77.5% while reducing refusals from 34.3% to 3.0%. Our results demonstrate that accuracy and safety need not conflict when explicit detection and repair mechanisms are in place.
%U https://aclanthology.org/2026.findings-acl.244/
%P 4940-4980
Markdown (Informal)
[DART: Mitigating Harm Drift in Difference-Aware LLMs via Distill-Audit-Repair Training](https://aclanthology.org/2026.findings-acl.244/) (Pan et al., Findings 2026)
ACL