@inproceedings{kwon-etal-2024-slm,
title = "{SLM} as Guardian: Pioneering {AI} Safety with Small Language Model",
author = "Kwon, Ohjoon and
Jeon, Donghyeon and
Choi, Nayoung and
Cho, Gyu-Hwung and
Jo, Hwiyeol and
Kim, Changbong and
Lee, Hyunwoo and
Kang, Inho and
Kim, Sun and
Park, Taiwoo",
editor = "Dernoncourt, Franck and
Preo{\c{t}}iuc-Pietro, Daniel and
Shimorina, Anastasia",
booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing: Industry Track",
month = nov,
year = "2024",
address = "Miami, Florida, US",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.emnlp-industry.99",
pages = "1333--1350",
abstract = "Most prior safety research of large language models (LLMs) has focused on enhancing the alignment of LLMs to better suit the safety requirements of their use cases. However, internalizing such safeguard features into larger models brought challenges of higher training cost and unintended degradation of helpfulness. In this paper, we leverage a smaller LLM for both harmful query detection and safeguard response generation. We introduce our safety requirements and the taxonomy of harmfulness categories, and then propose a multi-task learning mechanism fusing the two tasks into a single model. We demonstrate the effectiveness of our approach, providing on par or surpassing harmful query detection and safeguard response performance compared to the publicly available LLMs.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="kwon-etal-2024-slm">
<titleInfo>
<title>SLM as Guardian: Pioneering AI Safety with Small Language Model</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ohjoon</namePart>
<namePart type="family">Kwon</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Donghyeon</namePart>
<namePart type="family">Jeon</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nayoung</namePart>
<namePart type="family">Choi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gyu-Hwung</namePart>
<namePart type="family">Cho</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hwiyeol</namePart>
<namePart type="family">Jo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Changbong</namePart>
<namePart type="family">Kim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hyunwoo</namePart>
<namePart type="family">Lee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Inho</namePart>
<namePart type="family">Kang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sun</namePart>
<namePart type="family">Kim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Taiwoo</namePart>
<namePart type="family">Park</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing: Industry Track</title>
</titleInfo>
<name type="personal">
<namePart type="given">Franck</namePart>
<namePart type="family">Dernoncourt</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Daniel</namePart>
<namePart type="family">Preoţiuc-Pietro</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anastasia</namePart>
<namePart type="family">Shimorina</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, Florida, US</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Most prior safety research of large language models (LLMs) has focused on enhancing the alignment of LLMs to better suit the safety requirements of their use cases. However, internalizing such safeguard features into larger models brought challenges of higher training cost and unintended degradation of helpfulness. In this paper, we leverage a smaller LLM for both harmful query detection and safeguard response generation. We introduce our safety requirements and the taxonomy of harmfulness categories, and then propose a multi-task learning mechanism fusing the two tasks into a single model. We demonstrate the effectiveness of our approach, providing on par or surpassing harmful query detection and safeguard response performance compared to the publicly available LLMs.</abstract>
<identifier type="citekey">kwon-etal-2024-slm</identifier>
<location>
<url>https://aclanthology.org/2024.emnlp-industry.99</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>1333</start>
<end>1350</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T SLM as Guardian: Pioneering AI Safety with Small Language Model
%A Kwon, Ohjoon
%A Jeon, Donghyeon
%A Choi, Nayoung
%A Cho, Gyu-Hwung
%A Jo, Hwiyeol
%A Kim, Changbong
%A Lee, Hyunwoo
%A Kang, Inho
%A Kim, Sun
%A Park, Taiwoo
%Y Dernoncourt, Franck
%Y Preoţiuc-Pietro, Daniel
%Y Shimorina, Anastasia
%S Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing: Industry Track
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, US
%F kwon-etal-2024-slm
%X Most prior safety research of large language models (LLMs) has focused on enhancing the alignment of LLMs to better suit the safety requirements of their use cases. However, internalizing such safeguard features into larger models brought challenges of higher training cost and unintended degradation of helpfulness. In this paper, we leverage a smaller LLM for both harmful query detection and safeguard response generation. We introduce our safety requirements and the taxonomy of harmfulness categories, and then propose a multi-task learning mechanism fusing the two tasks into a single model. We demonstrate the effectiveness of our approach, providing on par or surpassing harmful query detection and safeguard response performance compared to the publicly available LLMs.
%U https://aclanthology.org/2024.emnlp-industry.99
%P 1333-1350
Markdown (Informal)
[SLM as Guardian: Pioneering AI Safety with Small Language Model](https://aclanthology.org/2024.emnlp-industry.99) (Kwon et al., EMNLP 2024)
ACL
Ohjoon Kwon, Donghyeon Jeon, Nayoung Choi, Gyu-Hwung Cho, Hwiyeol Jo, Changbong Kim, Hyunwoo Lee, Inho Kang, Sun Kim, and Taiwoo Park. 2024. SLM as Guardian: Pioneering AI Safety with Small Language Model. In Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing: Industry Track, pages 1333–1350, Miami, Florida, US. Association for Computational Linguistics.
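
The abstract describes a multi-task setup in which one small model is trained jointly for harmful query detection and safeguard response generation. As a rough illustration of what such a fused objective can look like, here is a minimal PyTorch sketch; the backbone ("gpt2"), the last-token pooling, the number of harm categories, and the loss weight `alpha` are all illustrative assumptions, not details taken from the paper.

```python
# A minimal sketch of a multi-task objective fusing harmful-query detection
# (classification) with safeguard-response generation (language modeling),
# in the spirit of the abstract. Backbone, pooling, category count, and loss
# weighting are assumptions for illustration, not the authors' implementation.
import torch
import torch.nn as nn
from transformers import AutoModelForCausalLM


class GuardianSLM(nn.Module):
    def __init__(self, backbone: str = "gpt2", num_harm_categories: int = 8):
        super().__init__()
        self.lm = AutoModelForCausalLM.from_pretrained(backbone)
        # Linear detection head over the backbone's hidden states.
        self.detector = nn.Linear(self.lm.config.hidden_size, num_harm_categories)

    def forward(self, input_ids, attention_mask,
                harm_labels=None, lm_labels=None, alpha: float = 0.5):
        out = self.lm(input_ids=input_ids, attention_mask=attention_mask,
                      labels=lm_labels, output_hidden_states=True)
        # Pool the hidden state of the last non-padding token (assumes
        # right padding) and classify the query's harmfulness category.
        last_hidden = out.hidden_states[-1]
        idx = attention_mask.sum(dim=1) - 1
        pooled = last_hidden[torch.arange(last_hidden.size(0)), idx]
        harm_logits = self.detector(pooled)

        loss = None
        if harm_labels is not None and lm_labels is not None:
            # Fuse the two task losses into a single training objective.
            det_loss = nn.functional.cross_entropy(harm_logits, harm_labels)
            loss = alpha * det_loss + (1 - alpha) * out.loss
        return harm_logits, loss
```

In this framing a single forward pass serves both tasks. The abstract does not specify whether the paper uses a classification head or a purely generative formulation for detection, so treat this only as one plausible realization of "fusing the two tasks into a single model."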