@inproceedings{pernisi-etal-2024-compromesso,
title = "Compromesso! {I}talian Many-Shot Jailbreaks undermine the safety of Large Language Models",
author = "Pernisi, Fabio and
Hovy, Dirk and
R{\"o}ttger, Paul",
editor = "Fu, Xiyan and
Fleisig, Eve",
booktitle = "Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 4: Student Research Workshop)",
month = aug,
year = "2024",
address = "Bangkok, Thailand",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.luhme-srw.29/",
doi = "10.18653/v1/2024.acl-srw.29",
pages = "245--251",
abstract = "As diverse linguistic communities and users adopt Large Language Models (LLMs), assessing their safety across languages becomes critical. Despite ongoing efforts to align these models with safe and ethical guidelines, they can still be induced into unsafe behavior with jailbreaking, a technique in which models are prompted to act outside their operational guidelines. What research has been conducted on these vulnerabilities was predominantly on English, limiting the understanding of LLM behavior in other languages. We address this gap by investigating Many-Shot Jailbreaking (MSJ) in Italian, underscoring the importance of understanding LLM behavior in different languages. We base our analysis on a newly created Italian dataset to identify unique safety vulnerabilities in 4 families of open-source LLMs. We find that the models exhibit unsafe behaviors even with minimal exposure to harmful prompts, and{--}more alarmingly{--}this tendency rapidly escalates with more demonstrations."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="pernisi-etal-2024-compromesso">
<titleInfo>
<title>Compromesso! Italian Many-Shot Jailbreaks undermine the safety of Large Language Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Fabio</namePart>
<namePart type="family">Pernisi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dirk</namePart>
<namePart type="family">Hovy</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Paul</namePart>
<namePart type="family">Röttger</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 4: Student Research Workshop)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Xiyan</namePart>
<namePart type="family">Fu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Eve</namePart>
<namePart type="family">Fleisig</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Bangkok, Thailand</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>As diverse linguistic communities and users adopt Large Language Models (LLMs), assessing their safety across languages becomes critical. Despite ongoing efforts to align these models with safe and ethical guidelines, they can still be induced into unsafe behavior with jailbreaking, a technique in which models are prompted to act outside their operational guidelines. What research has been conducted on these vulnerabilities was predominantly on English, limiting the understanding of LLM behavior in other languages. We address this gap by investigating Many-Shot Jailbreaking (MSJ) in Italian, underscoring the importance of understanding LLM behavior in different languages. We base our analysis on a newly created Italian dataset to identify unique safety vulnerabilities in 4 families of open-source LLMs. We find that the models exhibit unsafe behaviors even with minimal exposure to harmful prompts, and–more alarmingly–this tendency rapidly escalates with more demonstrations.</abstract>
<identifier type="citekey">pernisi-etal-2024-compromesso</identifier>
<identifier type="doi">10.18653/v1/2024.acl-srw.29</identifier>
<location>
<url>https://aclanthology.org/2024.luhme-srw.29/</url>
</location>
<part>
<date>2024-08</date>
<extent unit="page">
<start>245</start>
<end>251</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Compromesso! Italian Many-Shot Jailbreaks undermine the safety of Large Language Models
%A Pernisi, Fabio
%A Hovy, Dirk
%A Röttger, Paul
%Y Fu, Xiyan
%Y Fleisig, Eve
%S Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 4: Student Research Workshop)
%D 2024
%8 August
%I Association for Computational Linguistics
%C Bangkok, Thailand
%F pernisi-etal-2024-compromesso
%X As diverse linguistic communities and users adopt Large Language Models (LLMs), assessing their safety across languages becomes critical. Despite ongoing efforts to align these models with safe and ethical guidelines, they can still be induced into unsafe behavior with jailbreaking, a technique in which models are prompted to act outside their operational guidelines. What research has been conducted on these vulnerabilities was predominantly on English, limiting the understanding of LLM behavior in other languages. We address this gap by investigating Many-Shot Jailbreaking (MSJ) in Italian, underscoring the importance of understanding LLM behavior in different languages. We base our analysis on a newly created Italian dataset to identify unique safety vulnerabilities in 4 families of open-source LLMs. We find that the models exhibit unsafe behaviors even with minimal exposure to harmful prompts, and–more alarmingly–this tendency rapidly escalates with more demonstrations.
%R 10.18653/v1/2024.acl-srw.29
%U https://aclanthology.org/2024.luhme-srw.29/
%U https://doi.org/10.18653/v1/2024.acl-srw.29
%P 245-251
Markdown (Informal):
[Compromesso! Italian Many-Shot Jailbreaks undermine the safety of Large Language Models](https://aclanthology.org/2024.luhme-srw.29/) (Pernisi et al., ACL 2024)