@inproceedings{mei-etal-2025-aligned,
title = "{\textquotedblleft}Not Aligned{\textquotedblright} is Not {\textquotedblleft}Malicious{\textquotedblright}: Being Careful about Hallucinations of Large Language Models' Jailbreak",
author = "Mei, Lingrui and
Liu, Shenghua and
Wang, Yiwei and
Bi, Baolong and
Mao, Jiayi and
Cheng, Xueqi",
editor = "Rambow, Owen and
Wanner, Leo and
Apidianaki, Marianna and
Al-Khalifa, Hend and
Eugenio, Barbara Di and
Schockaert, Steven",
booktitle = "Proceedings of the 31st International Conference on Computational Linguistics",
month = jan,
year = "2025",
address = "Abu Dhabi, UAE",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.coling-main.146/",
pages = "2144--2162",
abstract = "{\textquotedblleft}Jailbreak{\textquotedblright} is a major safety concern of Large Language Models (LLMs), which occurs when malicious prompts lead LLMs to produce harmful outputs, raising issues about the reliability and safety of LLMs. Therefore, an effective evaluation of jailbreaks is very crucial to develop its mitigation strategies. However, our research reveals that many jailbreaks identified by current evaluations may actually be hallucinations{---}erroneous outputs that are mistaken for genuine safety breaches. This finding suggests that some perceived vulnerabilities might not represent actual threats, indicating a need for more precise red teaming benchmarks. To address this problem, we propose the Benchmark for reliABilitY and jailBreak haLlUcination Evaluation (BabyBLUE). BabyBLUE introduces a specialized validation framework including various evaluators to enhance existing jailbreak benchmarks, ensuring outputs are useful malicious instructions. Additionally, BabyBLUE presents a new dataset as an augmentation to the existing red teaming benchmarks, specifically addressing hallucinations in jailbreaks, aiming to evaluate the true potential of jailbroken LLM outputs to cause harm to human society."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="mei-etal-2025-aligned">
<titleInfo>
<title>“Not Aligned” is Not “Malicious”: Being Careful about Hallucinations of Large Language Models’ Jailbreak</title>
</titleInfo>
<name type="personal">
<namePart type="given">Lingrui</namePart>
<namePart type="family">Mei</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shenghua</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yiwei</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Baolong</namePart>
<namePart type="family">Bi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiayi</namePart>
<namePart type="family">Mao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xueqi</namePart>
<namePart type="family">Cheng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-01</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 31st International Conference on Computational Linguistics</title>
</titleInfo>
<name type="personal">
<namePart type="given">Owen</namePart>
<namePart type="family">Rambow</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Leo</namePart>
<namePart type="family">Wanner</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marianna</namePart>
<namePart type="family">Apidianaki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hend</namePart>
<namePart type="family">Al-Khalifa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Barbara</namePart>
<namePart type="given">Di</namePart>
<namePart type="family">Eugenio</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Steven</namePart>
<namePart type="family">Schockaert</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, UAE</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>“Jailbreak” is a major safety concern of Large Language Models (LLMs), which occurs when malicious prompts lead LLMs to produce harmful outputs, raising concerns about the reliability and safety of LLMs. Therefore, effective evaluation of jailbreaks is crucial for developing mitigation strategies. However, our research reveals that many jailbreaks identified by current evaluations may actually be hallucinations—erroneous outputs that are mistaken for genuine safety breaches. This finding suggests that some perceived vulnerabilities might not represent actual threats, indicating a need for more precise red teaming benchmarks. To address this problem, we propose the Benchmark for reliABilitY and jailBreak haLlUcination Evaluation (BabyBLUE). BabyBLUE introduces a specialized validation framework including various evaluators to enhance existing jailbreak benchmarks, ensuring that outputs are genuinely useful malicious instructions. Additionally, BabyBLUE presents a new dataset as an augmentation to existing red teaming benchmarks, specifically addressing hallucinations in jailbreaks, aiming to evaluate the true potential of jailbroken LLM outputs to cause harm to human society.</abstract>
<identifier type="citekey">mei-etal-2025-aligned</identifier>
<location>
<url>https://aclanthology.org/2025.coling-main.146/</url>
</location>
<part>
<date>2025-01</date>
<extent unit="page">
<start>2144</start>
<end>2162</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T “Not Aligned” is Not “Malicious”: Being Careful about Hallucinations of Large Language Models’ Jailbreak
%A Mei, Lingrui
%A Liu, Shenghua
%A Wang, Yiwei
%A Bi, Baolong
%A Mao, Jiayi
%A Cheng, Xueqi
%Y Rambow, Owen
%Y Wanner, Leo
%Y Apidianaki, Marianna
%Y Al-Khalifa, Hend
%Y Eugenio, Barbara Di
%Y Schockaert, Steven
%S Proceedings of the 31st International Conference on Computational Linguistics
%D 2025
%8 January
%I Association for Computational Linguistics
%C Abu Dhabi, UAE
%F mei-etal-2025-aligned
%X “Jailbreak” is a major safety concern of Large Language Models (LLMs), which occurs when malicious prompts lead LLMs to produce harmful outputs, raising concerns about the reliability and safety of LLMs. Therefore, effective evaluation of jailbreaks is crucial for developing mitigation strategies. However, our research reveals that many jailbreaks identified by current evaluations may actually be hallucinations—erroneous outputs that are mistaken for genuine safety breaches. This finding suggests that some perceived vulnerabilities might not represent actual threats, indicating a need for more precise red teaming benchmarks. To address this problem, we propose the Benchmark for reliABilitY and jailBreak haLlUcination Evaluation (BabyBLUE). BabyBLUE introduces a specialized validation framework including various evaluators to enhance existing jailbreak benchmarks, ensuring that outputs are genuinely useful malicious instructions. Additionally, BabyBLUE presents a new dataset as an augmentation to existing red teaming benchmarks, specifically addressing hallucinations in jailbreaks, aiming to evaluate the true potential of jailbroken LLM outputs to cause harm to human society.
%U https://aclanthology.org/2025.coling-main.146/
%P 2144-2162
Markdown (Informal)
[“Not Aligned” is Not “Malicious”: Being Careful about Hallucinations of Large Language Models’ Jailbreak](https://aclanthology.org/2025.coling-main.146/) (Mei et al., COLING 2025)
ACL
Lingrui Mei, Shenghua Liu, Yiwei Wang, Baolong Bi, Jiayi Mao, and Xueqi Cheng. 2025. “Not Aligned” is Not “Malicious”: Being Careful about Hallucinations of Large Language Models’ Jailbreak. In Proceedings of the 31st International Conference on Computational Linguistics, pages 2144–2162, Abu Dhabi, UAE. Association for Computational Linguistics.