@inproceedings{zhang-etal-2023-instructsafety,
title = "{I}nstruct{S}afety: A Unified Framework for Building Multidimensional and Explainable Safety Detector through Instruction Tuning",
author = "Zhang, Zhexin and
Cheng, Jiale and
Sun, Hao and
Deng, Jiawen and
Huang, Minlie",
editor = "Bouamor, Houda and
Pino, Juan and
Bali, Kalika",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2023",
month = dec,
year = "2023",
address = "Singapore",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.findings-emnlp.700/",
doi = "10.18653/v1/2023.findings-emnlp.700",
pages = "10421--10436",
abstract = "Safety detection has been an increasingly important topic in recent years and it has become even more necessary to develop reliable safety detection systems with the rapid development of large language models. However, currently available safety detection systems have limitations in terms of their versatility and interpretability. In this paper, we first introduce InstructSafety, a safety detection framework that unifies 7 common sub-tasks for safety detection. These tasks are unified into a similar form through different instructions. We then conduct a comprehensive survey of existing safety detection datasets and process 39 human-annotated datasets for instruction tuning. We also construct adversarial samples to enhance the model`s robustness. After fine-tuning Flan-T5 on the collected data, we have developed Safety-Flan-T5, a multidimensional and explainable safety detector. We conduct comprehensive experiments on a variety of datasets and tasks, and demonstrate the strong performance of Safety-Flan-T5 in comparison to supervised baselines and served APIs (Perspective API, ChatGPT and InstructGPT). We will release the processed data, fine-tuned Safety-Flan-T5 and related code for public use."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="zhang-etal-2023-instructsafety">
<titleInfo>
<title>InstructSafety: A Unified Framework for Building Multidimensional and Explainable Safety Detector through Instruction Tuning</title>
</titleInfo>
<name type="personal">
<namePart type="given">Zhexin</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiale</namePart>
<namePart type="family">Cheng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hao</namePart>
<namePart type="family">Sun</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiawen</namePart>
<namePart type="family">Deng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Minlie</namePart>
<namePart type="family">Huang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2023</title>
</titleInfo>
<name type="personal">
<namePart type="given">Houda</namePart>
<namePart type="family">Bouamor</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Juan</namePart>
<namePart type="family">Pino</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kalika</namePart>
<namePart type="family">Bali</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Singapore</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Safety detection has been an increasingly important topic in recent years and it has become even more necessary to develop reliable safety detection systems with the rapid development of large language models. However, currently available safety detection systems have limitations in terms of their versatility and interpretability. In this paper, we first introduce InstructSafety, a safety detection framework that unifies 7 common sub-tasks for safety detection. These tasks are unified into a similar form through different instructions. We then conduct a comprehensive survey of existing safety detection datasets and process 39 human-annotated datasets for instruction tuning. We also construct adversarial samples to enhance the model‘s robustness. After fine-tuning Flan-T5 on the collected data, we have developed Safety-Flan-T5, a multidimensional and explainable safety detector. We conduct comprehensive experiments on a variety of datasets and tasks, and demonstrate the strong performance of Safety-Flan-T5 in comparison to supervised baselines and served APIs (Perspective API, ChatGPT and InstructGPT). We will release the processed data, fine-tuned Safety-Flan-T5 and related code for public use.</abstract>
<identifier type="citekey">zhang-etal-2023-instructsafety</identifier>
<identifier type="doi">10.18653/v1/2023.findings-emnlp.700</identifier>
<location>
<url>https://aclanthology.org/2023.findings-emnlp.700/</url>
</location>
<part>
<date>2023-12</date>
<extent unit="page">
<start>10421</start>
<end>10436</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T InstructSafety: A Unified Framework for Building Multidimensional and Explainable Safety Detector through Instruction Tuning
%A Zhang, Zhexin
%A Cheng, Jiale
%A Sun, Hao
%A Deng, Jiawen
%A Huang, Minlie
%Y Bouamor, Houda
%Y Pino, Juan
%Y Bali, Kalika
%S Findings of the Association for Computational Linguistics: EMNLP 2023
%D 2023
%8 December
%I Association for Computational Linguistics
%C Singapore
%F zhang-etal-2023-instructsafety
%X Safety detection has been an increasingly important topic in recent years and it has become even more necessary to develop reliable safety detection systems with the rapid development of large language models. However, currently available safety detection systems have limitations in terms of their versatility and interpretability. In this paper, we first introduce InstructSafety, a safety detection framework that unifies 7 common sub-tasks for safety detection. These tasks are unified into a similar form through different instructions. We then conduct a comprehensive survey of existing safety detection datasets and process 39 human-annotated datasets for instruction tuning. We also construct adversarial samples to enhance the model‘s robustness. After fine-tuning Flan-T5 on the collected data, we have developed Safety-Flan-T5, a multidimensional and explainable safety detector. We conduct comprehensive experiments on a variety of datasets and tasks, and demonstrate the strong performance of Safety-Flan-T5 in comparison to supervised baselines and served APIs (Perspective API, ChatGPT and InstructGPT). We will release the processed data, fine-tuned Safety-Flan-T5 and related code for public use.
%R 10.18653/v1/2023.findings-emnlp.700
%U https://aclanthology.org/2023.findings-emnlp.700/
%U https://doi.org/10.18653/v1/2023.findings-emnlp.700
%P 10421-10436
Markdown (Informal)
[InstructSafety: A Unified Framework for Building Multidimensional and Explainable Safety Detector through Instruction Tuning](https://aclanthology.org/2023.findings-emnlp.700/) (Zhang et al., Findings 2023)
ACL