@inproceedings{khalatbari-etal-2024-flatness,
title = "Flatness-Aware Gradient Descent for Safe Conversational {AI}",
author = "Khalatbari, Leila and
Hosseini, Saeid and
Sameti, Hossein and
Fung, Pascale",
editor = "Ovalle, Anaelia and
Chang, Kai-Wei and
Cao, Yang Trista and
Mehrabi, Ninareh and
Zhao, Jieyu and
Galstyan, Aram and
Dhamala, Jwala and
Kumar, Anoop and
Gupta, Rahul",
booktitle = "Proceedings of the 4th Workshop on Trustworthy Natural Language Processing (TrustNLP 2024)",
month = jun,
year = "2024",
address = "Mexico City, Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.trustnlp-1.15/",
doi = "10.18653/v1/2024.trustnlp-1.15",
pages = "187--195",
abstract = "As generative dialog models become ubiquitous in real-world applications, it is paramount to ensure a harmless generation. There are two major challenges when enforcing safety to open-domain chatbots. Firstly, it is impractical to provide training data reflecting the desired response to all emerging forms of toxicity (generalisation challenge). Secondly, implementing safety features may compromise the quality of the conversation (trade-off challenge). To tackle the challenges, this paper introduces a regularized fine-tuning approach called FlatGD. By employing a safety-tailored loss, we translate better optimization to more safety. To ensure better optimization, FlatGD penalizes sharp trajectories of loss curve, encouraging flatness of the converged local minima. Experimental results on datasets of {\textquotedblleft}BAD{\textquotedblright} and {\textquotedblleft}prosocial dialog{\textquotedblright} demonstrate that our model outperforms the current baselines in reducing toxicity while preserving the conversation quality. Moreover, compared to other baselines, FlatGD can better generalize to unseen toxic data."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="khalatbari-etal-2024-flatness">
<titleInfo>
<title>Flatness-Aware Gradient Descent for Safe Conversational AI</title>
</titleInfo>
<name type="personal">
<namePart type="given">Leila</namePart>
<namePart type="family">Khalatbari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Saeid</namePart>
<namePart type="family">Hosseini</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hossein</namePart>
<namePart type="family">Sameti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pascale</namePart>
<namePart type="family">Fung</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 4th Workshop on Trustworthy Natural Language Processing (TrustNLP 2024)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Anaelia</namePart>
<namePart type="family">Ovalle</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kai-Wei</namePart>
<namePart type="family">Chang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yang</namePart>
<namePart type="given">Trista</namePart>
<namePart type="family">Cao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ninareh</namePart>
<namePart type="family">Mehrabi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jieyu</namePart>
<namePart type="family">Zhao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aram</namePart>
<namePart type="family">Galstyan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jwala</namePart>
<namePart type="family">Dhamala</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anoop</namePart>
<namePart type="family">Kumar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rahul</namePart>
<namePart type="family">Gupta</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Mexico City, Mexico</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>As generative dialog models become ubiquitous in real-world applications, it is paramount to ensure a harmless generation. There are two major challenges when enforcing safety to open-domain chatbots. Firstly, it is impractical to provide training data reflecting the desired response to all emerging forms of toxicity (generalisation challenge). Secondly, implementing safety features may compromise the quality of the conversation (trade-off challenge). To tackle the challenges, this paper introduces a regularized fine-tuning approach called FlatGD. By employing a safety-tailored loss, we translate better optimization to more safety. To ensure better optimization, FlatGD penalizes sharp trajectories of loss curve, encouraging flatness of the converged local minima. Experimental results on datasets of “BAD” and “prosocial dialog” demonstrate that our model outperforms the current baselines in reducing toxicity while preserving the conversation quality. Moreover, compared to other baselines, FlatGD can better generalize to unseen toxic data.</abstract>
<identifier type="citekey">khalatbari-etal-2024-flatness</identifier>
<identifier type="doi">10.18653/v1/2024.trustnlp-1.15</identifier>
<location>
<url>https://aclanthology.org/2024.trustnlp-1.15/</url>
</location>
<part>
<date>2024-06</date>
<extent unit="page">
<start>187</start>
<end>195</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Flatness-Aware Gradient Descent for Safe Conversational AI
%A Khalatbari, Leila
%A Hosseini, Saeid
%A Sameti, Hossein
%A Fung, Pascale
%Y Ovalle, Anaelia
%Y Chang, Kai-Wei
%Y Cao, Yang Trista
%Y Mehrabi, Ninareh
%Y Zhao, Jieyu
%Y Galstyan, Aram
%Y Dhamala, Jwala
%Y Kumar, Anoop
%Y Gupta, Rahul
%S Proceedings of the 4th Workshop on Trustworthy Natural Language Processing (TrustNLP 2024)
%D 2024
%8 June
%I Association for Computational Linguistics
%C Mexico City, Mexico
%F khalatbari-etal-2024-flatness
%X As generative dialog models become ubiquitous in real-world applications, it is paramount to ensure a harmless generation. There are two major challenges when enforcing safety to open-domain chatbots. Firstly, it is impractical to provide training data reflecting the desired response to all emerging forms of toxicity (generalisation challenge). Secondly, implementing safety features may compromise the quality of the conversation (trade-off challenge). To tackle the challenges, this paper introduces a regularized fine-tuning approach called FlatGD. By employing a safety-tailored loss, we translate better optimization to more safety. To ensure better optimization, FlatGD penalizes sharp trajectories of loss curve, encouraging flatness of the converged local minima. Experimental results on datasets of “BAD” and “prosocial dialog” demonstrate that our model outperforms the current baselines in reducing toxicity while preserving the conversation quality. Moreover, compared to other baselines, FlatGD can better generalize to unseen toxic data.
%R 10.18653/v1/2024.trustnlp-1.15
%U https://aclanthology.org/2024.trustnlp-1.15/
%U https://doi.org/10.18653/v1/2024.trustnlp-1.15
%P 187-195
Markdown (Informal)
[Flatness-Aware Gradient Descent for Safe Conversational AI](https://aclanthology.org/2024.trustnlp-1.15/) (Khalatbari et al., TrustNLP 2024)
ACL
- Leila Khalatbari, Saeid Hosseini, Hossein Sameti, and Pascale Fung. 2024. Flatness-Aware Gradient Descent for Safe Conversational AI. In Proceedings of the 4th Workshop on Trustworthy Natural Language Processing (TrustNLP 2024), pages 187–195, Mexico City, Mexico. Association for Computational Linguistics.