@inproceedings{zhang-etal-2024-shieldlm,
title = "{S}hield{LM}: Empowering {LLM}s as Aligned, Customizable and Explainable Safety Detectors",
author = "Zhang, Zhexin and
Lu, Yida and
Ma, Jingyuan and
Zhang, Di and
Li, Rui and
Ke, Pei and
Sun, Hao and
Sha, Lei and
Sui, Zhifang and
Wang, Hongning and
Huang, Minlie",
editor = "Al-Onaizan, Yaser and
Bansal, Mohit and
Chen, Yun-Nung",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2024",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.findings-emnlp.610/",
doi = "10.18653/v1/2024.findings-emnlp.610",
pages = "10420--10438",
abstract = "The safety of Large Language Models (LLMs) has gained increasing attention in recent years, but there still lacks a comprehensive approach for detecting safety issues within LLMs' responses in an aligned, customizable and explainable manner. In this paper, we propose ShieldLM, an LLM-based safety detector, which aligns with common safety standards, supports customizable detection rules, and provides explanations for its decisions. To train ShieldLM, we compile a large bilingual dataset comprising 14,387 query-response pairs, annotating the safety of responses based on various safety standards. Through extensive experiments, we demonstrate that ShieldLM surpasses strong baselines across four test sets, showcasing remarkable customizability and explainability. Besides performing well on standard detection datasets, ShieldLM has also been shown to be effective as a safety evaluator for advanced LLMs. ShieldLM is released at \url{https://github.com/thu-coai/ShieldLM} to support accurate and explainable safety detection under various safety standards."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="zhang-etal-2024-shieldlm">
<titleInfo>
<title>ShieldLM: Empowering LLMs as Aligned, Customizable and Explainable Safety Detectors</title>
</titleInfo>
<name type="personal">
<namePart type="given">Zhexin</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yida</namePart>
<namePart type="family">Lu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jingyuan</namePart>
<namePart type="family">Ma</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Di</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rui</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pei</namePart>
<namePart type="family">Ke</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hao</namePart>
<namePart type="family">Sun</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lei</namePart>
<namePart type="family">Sha</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhifang</namePart>
<namePart type="family">Sui</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hongning</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Minlie</namePart>
<namePart type="family">Huang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2024</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yaser</namePart>
<namePart type="family">Al-Onaizan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohit</namePart>
<namePart type="family">Bansal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yun-Nung</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, Florida, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>The safety of Large Language Models (LLMs) has gained increasing attention in recent years, but there still lacks a comprehensive approach for detecting safety issues within LLMs’ responses in an aligned, customizable and explainable manner. In this paper, we propose ShieldLM, an LLM-based safety detector, which aligns with common safety standards, supports customizable detection rules, and provides explanations for its decisions. To train ShieldLM, we compile a large bilingual dataset comprising 14,387 query-response pairs, annotating the safety of responses based on various safety standards. Through extensive experiments, we demonstrate that ShieldLM surpasses strong baselines across four test sets, showcasing remarkable customizability and explainability. Besides performing well on standard detection datasets, ShieldLM has also been shown to be effective as a safety evaluator for advanced LLMs. ShieldLM is released at https://github.com/thu-coai/ShieldLM to support accurate and explainable safety detection under various safety standards.</abstract>
<identifier type="citekey">zhang-etal-2024-shieldlm</identifier>
<identifier type="doi">10.18653/v1/2024.findings-emnlp.610</identifier>
<location>
<url>https://aclanthology.org/2024.findings-emnlp.610/</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>10420</start>
<end>10438</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T ShieldLM: Empowering LLMs as Aligned, Customizable and Explainable Safety Detectors
%A Zhang, Zhexin
%A Lu, Yida
%A Ma, Jingyuan
%A Zhang, Di
%A Li, Rui
%A Ke, Pei
%A Sun, Hao
%A Sha, Lei
%A Sui, Zhifang
%A Wang, Hongning
%A Huang, Minlie
%Y Al-Onaizan, Yaser
%Y Bansal, Mohit
%Y Chen, Yun-Nung
%S Findings of the Association for Computational Linguistics: EMNLP 2024
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F zhang-etal-2024-shieldlm
%X The safety of Large Language Models (LLMs) has gained increasing attention in recent years, but there still lacks a comprehensive approach for detecting safety issues within LLMs’ responses in an aligned, customizable and explainable manner. In this paper, we propose ShieldLM, an LLM-based safety detector, which aligns with common safety standards, supports customizable detection rules, and provides explanations for its decisions. To train ShieldLM, we compile a large bilingual dataset comprising 14,387 query-response pairs, annotating the safety of responses based on various safety standards. Through extensive experiments, we demonstrate that ShieldLM surpasses strong baselines across four test sets, showcasing remarkable customizability and explainability. Besides performing well on standard detection datasets, ShieldLM has also been shown to be effective as a safety evaluator for advanced LLMs. ShieldLM is released at https://github.com/thu-coai/ShieldLM to support accurate and explainable safety detection under various safety standards.
%R 10.18653/v1/2024.findings-emnlp.610
%U https://aclanthology.org/2024.findings-emnlp.610/
%U https://doi.org/10.18653/v1/2024.findings-emnlp.610
%P 10420-10438
Markdown (Informal)
[ShieldLM: Empowering LLMs as Aligned, Customizable and Explainable Safety Detectors](https://aclanthology.org/2024.findings-emnlp.610/) (Zhang et al., Findings 2024)
ACL
- Zhexin Zhang, Yida Lu, Jingyuan Ma, Di Zhang, Rui Li, Pei Ke, Hao Sun, Lei Sha, Zhifang Sui, Hongning Wang, and Minlie Huang. 2024. ShieldLM: Empowering LLMs as Aligned, Customizable and Explainable Safety Detectors. In Findings of the Association for Computational Linguistics: EMNLP 2024, pages 10420–10438, Miami, Florida, USA. Association for Computational Linguistics.