@inproceedings{krasnodebska-etal-2025-rainbow,
title = "Rainbow-Teaming for the {P}olish Language: A Reproducibility Study",
author = "Krasnod{\k{e}}bska, Aleksandra and
Chrabaszcz, Maciej and
Kusa, Wojciech",
editor = "Cao, Trista and
Das, Anubrata and
Kumarage, Tharindu and
Wan, Yixin and
Krishna, Satyapriya and
Mehrabi, Ninareh and
Dhamala, Jwala and
Ramakrishna, Anil and
Galystan, Aram and
Kumar, Anoop and
Gupta, Rahul and
Chang, Kai-Wei",
booktitle = "Proceedings of the 5th Workshop on Trustworthy NLP (TrustNLP 2025)",
month = may,
year = "2025",
address = "Albuquerque, New Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.trustnlp-main.12/",
pages = "155--165",
ISBN = "979-8-89176-233-6",
abstract = "The development of multilingual large language models (LLMs) presents challenges in evaluating their safety across all supported languages. Enhancing safety in one language (e.g., English) may inadvertently introduce vulnerabilities in others. To address this issue, we implement a methodology for the automatic creation of red-teaming datasets for safety evaluation in Polish language. Our approach generates both harmful and non-harmful prompts by sampling different risk categories and attack styles. We test several open-source models, including those trained on Polish data, and evaluate them using metrics such as Attack Success Rate (ASR) and False Reject Rate (FRR). The results reveal clear gaps in safety performance between models and show that better testing across languages is needed."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="krasnodebska-etal-2025-rainbow">
<titleInfo>
<title>Rainbow-Teaming for the Polish Language: A Reproducibility Study</title>
</titleInfo>
<name type="personal">
<namePart type="given">Aleksandra</namePart>
<namePart type="family">Krasnodębska</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Maciej</namePart>
<namePart type="family">Chrabaszcz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wojciech</namePart>
<namePart type="family">Kusa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 5th Workshop on Trustworthy NLP (TrustNLP 2025)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Trista</namePart>
<namePart type="family">Cao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anubrata</namePart>
<namePart type="family">Das</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tharindu</namePart>
<namePart type="family">Kumarage</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yixin</namePart>
<namePart type="family">Wan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Satyapriya</namePart>
<namePart type="family">Krishna</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ninareh</namePart>
<namePart type="family">Mehrabi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jwala</namePart>
<namePart type="family">Dhamala</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anil</namePart>
<namePart type="family">Ramakrishna</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aram</namePart>
<namePart type="family">Galystan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anoop</namePart>
<namePart type="family">Kumar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rahul</namePart>
<namePart type="family">Gupta</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kai-Wei</namePart>
<namePart type="family">Chang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Albuquerque, New Mexico</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-233-6</identifier>
</relatedItem>
<abstract>The development of multilingual large language models (LLMs) presents challenges in evaluating their safety across all supported languages. Enhancing safety in one language (e.g., English) may inadvertently introduce vulnerabilities in others. To address this issue, we implement a methodology for the automatic creation of red-teaming datasets for safety evaluation in Polish language. Our approach generates both harmful and non-harmful prompts by sampling different risk categories and attack styles. We test several open-source models, including those trained on Polish data, and evaluate them using metrics such as Attack Success Rate (ASR) and False Reject Rate (FRR). The results reveal clear gaps in safety performance between models and show that better testing across languages is needed.</abstract>
<identifier type="citekey">krasnodebska-etal-2025-rainbow</identifier>
<location>
<url>https://aclanthology.org/2025.trustnlp-main.12/</url>
</location>
<part>
<date>2025-05</date>
<extent unit="page">
<start>155</start>
<end>165</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Rainbow-Teaming for the Polish Language: A Reproducibility Study
%A Krasnodębska, Aleksandra
%A Chrabaszcz, Maciej
%A Kusa, Wojciech
%Y Cao, Trista
%Y Das, Anubrata
%Y Kumarage, Tharindu
%Y Wan, Yixin
%Y Krishna, Satyapriya
%Y Mehrabi, Ninareh
%Y Dhamala, Jwala
%Y Ramakrishna, Anil
%Y Galystan, Aram
%Y Kumar, Anoop
%Y Gupta, Rahul
%Y Chang, Kai-Wei
%S Proceedings of the 5th Workshop on Trustworthy NLP (TrustNLP 2025)
%D 2025
%8 May
%I Association for Computational Linguistics
%C Albuquerque, New Mexico
%@ 979-8-89176-233-6
%F krasnodebska-etal-2025-rainbow
%X The development of multilingual large language models (LLMs) presents challenges in evaluating their safety across all supported languages. Enhancing safety in one language (e.g., English) may inadvertently introduce vulnerabilities in others. To address this issue, we implement a methodology for the automatic creation of red-teaming datasets for safety evaluation in Polish language. Our approach generates both harmful and non-harmful prompts by sampling different risk categories and attack styles. We test several open-source models, including those trained on Polish data, and evaluate them using metrics such as Attack Success Rate (ASR) and False Reject Rate (FRR). The results reveal clear gaps in safety performance between models and show that better testing across languages is needed.
%U https://aclanthology.org/2025.trustnlp-main.12/
%P 155-165
Markdown (Informal)
[Rainbow-Teaming for the Polish Language: A Reproducibility Study](https://aclanthology.org/2025.trustnlp-main.12/) (Krasnodębska et al., TrustNLP 2025)
ACL