% BibTeX record: conference paper (@inproceedings), citation key choi-etal-2026-finharmbench.
% Case-sensitive words in title/booktitle are protected with whole-word braces.
% The abstract is kept as plain text; presentation markup is left to the consumer.
@inproceedings{choi-etal-2026-finharmbench,
    title     = {{FinHarmBench}: Financial Jailbreak Benchmark and Unsupervised Safety Fine-Tuning via Refusal Steering Distillation},
    author    = {Choi, Yubin and
                 Yang, Yujin and
                 Kim, Subin and
                 Ham, Seokil and
                 Cho, Seungju and
                 Son, Jungmin and
                 Kwak, Youngjun and
                 Kim, Changick},
    editor    = {Li, Yunyao and
                 Rehm, Georg and
                 Tu, Mei},
    booktitle = {Proceedings of the 64th Annual Meeting of the {Association} for {Computational} {Linguistics} ({ACL} 2026)},
    month     = jul,
    year      = {2026},
    address   = {San Diego, California, USA},
    publisher = {Association for Computational Linguistics},
    url       = {https://aclanthology.org/2026.acl-industry.117/},
    pages     = {1714--1726},
    isbn      = {979-8-89176-394-4},
    abstract  = {Financial Large Language Models (LLMs) exhibit strong domain expertise but remain vulnerable to financially harmful prompts. To systematically assess this vulnerability, we introduce FinHarmBench, a benchmark designed to evaluate financially harmful and confusable benign prompts. Our analysis reveals a concerning result that financial LLMs can be less robust than general-purpose models, suggesting that domain adaptation alone does not guarantee financial safety alignment. To address this issue, we propose Financial Refusal Steering Distillation (FiRSD), an unsupervised training framework that strengthens financial-domain safety by learning and distilling a financial refusal direction at the representation level. FiRSD enhances refusal behavior without requiring annotated refusal responses. Experiments show that FiRSD substantially improves safety while largely preserving task capability. These results highlight the importance of domain-aware safety alignment for high-stakes financial applications.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- Single MODS record; its ID matches the citekey identifier further down. -->
  <mods ID="choi-etal-2026-finharmbench">
    <titleInfo>
      <title>FinHarmBench: Financial Jailbreak Benchmark and Unsupervised Safety Fine-Tuning via Refusal Steering Distillation</title>
    </titleInfo>

    <!-- Authors of the paper, one <name> element per person, in listed order. -->
    <name type="personal">
      <namePart type="given">Yubin</namePart>
      <namePart type="family">Choi</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Yujin</namePart>
      <namePart type="family">Yang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Subin</namePart>
      <namePart type="family">Kim</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Seokil</namePart>
      <namePart type="family">Ham</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Seungju</namePart>
      <namePart type="family">Cho</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Jungmin</namePart>
      <namePart type="family">Son</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Youngjun</namePart>
      <namePart type="family">Kwak</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Changick</namePart>
      <namePart type="family">Kim</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>

    <originInfo>
      <dateIssued>2026-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>

    <!-- Host item: the proceedings volume, with its editors, publisher, place, genre and ISBN. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (ACL 2026)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Yunyao</namePart>
        <namePart type="family">Li</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Georg</namePart>
        <namePart type="family">Rehm</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Mei</namePart>
        <namePart type="family">Tu</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">San Diego, California, USA</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-394-4</identifier>
    </relatedItem>

    <abstract>Financial Large Language Models (LLMs) exhibit strong domain expertise but remain vulnerable to financially harmful prompts. To systematically assess this vulnerability, we introduce FinHarmBench, a benchmark designed to evaluate financially harmful and confusable benign prompts. Our analysis reveals a concerning result that financial LLMs can be less robust than general-purpose models, suggesting that domain adaptation alone does not guarantee financial safety alignment. To address this issue, we propose Financial Refusal Steering Distillation (FiRSD), an unsupervised training framework that strengthens financial-domain safety by learning and distilling a financial refusal direction at the representation level. FiRSD enhances refusal behavior without requiring annotated refusal responses. Experiments show that FiRSD substantially improves safety while largely preserving task capability. These results highlight the importance of domain-aware safety alignment for high-stakes financial applications.</abstract>
    <identifier type="citekey">choi-etal-2026-finharmbench</identifier>
    <location>
      <url>https://aclanthology.org/2026.acl-industry.117/</url>
    </location>

    <!-- Position of the paper within the host volume: issue date and page extent. -->
    <part>
      <date>2026-07</date>
      <extent unit="page">
        <start>1714</start>
        <end>1726</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T FinHarmBench: Financial Jailbreak Benchmark and Unsupervised Safety Fine-Tuning via Refusal Steering Distillation
%A Choi, Yubin
%A Yang, Yujin
%A Kim, Subin
%A Ham, Seokil
%A Cho, Seungju
%A Son, Jungmin
%A Kwak, Youngjun
%A Kim, Changick
%Y Li, Yunyao
%Y Rehm, Georg
%Y Tu, Mei
%S Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (ACL 2026)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, USA
%@ 979-8-89176-394-4
%F choi-etal-2026-finharmbench
%X Financial Large Language Models (LLMs) exhibit strong domain expertise but remain vulnerable to financially harmful prompts. To systematically assess this vulnerability, we introduce FinHarmBench, a benchmark designed to evaluate financially harmful and confusable benign prompts. Our analysis reveals a concerning result that financial LLMs can be less robust than general-purpose models, suggesting that domain adaptation alone does not guarantee financial safety alignment. To address this issue, we propose Financial Refusal Steering Distillation (FiRSD), an unsupervised training framework that strengthens financial-domain safety by learning and distilling a financial refusal direction at the representation level. FiRSD enhances refusal behavior without requiring annotated refusal responses. Experiments show that FiRSD substantially improves safety while largely preserving task capability. These results highlight the importance of domain-aware safety alignment for high-stakes financial applications.
%U https://aclanthology.org/2026.acl-industry.117/
%P 1714-1726
Markdown (Informal)
[FinHarmBench: Financial Jailbreak Benchmark and Unsupervised Safety Fine-Tuning via Refusal Steering Distillation](https://aclanthology.org/2026.acl-industry.117/) (Choi et al., ACL 2026)
ACL
- Yubin Choi, Yujin Yang, Subin Kim, Seokil Ham, Seungju Cho, Jungmin Son, Youngjun Kwak, and Changick Kim. 2026. FinHarmBench: Financial Jailbreak Benchmark and Unsupervised Safety Fine-Tuning via Refusal Steering Distillation. In Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (ACL 2026), pages 1714–1726, San Diego, California, USA. Association for Computational Linguistics.