% Conference-paper record for "PL-Guard" in the Slavic NLP 2025 workshop proceedings.
% Case-sensitive words are braced as whole words so that sentence-casing
% bibliography styles keep "PL-Guard", "Polish", "Slavic" and "NLP" capitalised.
@inproceedings{krasnodebska-etal-2025-pl,
    title     = {{PL-Guard}: Benchmarking Language Model Safety for {Polish}},
    author    = {Krasnodebska, Aleksandra and
                 Seweryn, Karolina and
                 {\L}ukasik, Szymon and
                 Kusa, Wojciech},
    editor    = {Piskorski, Jakub and
                 P{\v{r}}ib{\'a}{\v{n}}, Pavel and
                 Nakov, Preslav and
                 Yangarber, Roman and
                 Marcinczuk, Michal},
    booktitle = {Proceedings of the 10th Workshop on {Slavic} Natural Language Processing ({Slavic} {NLP} 2025)},
    month     = jul,
    year      = {2025},
    address   = {Vienna, Austria},
    publisher = {Association for Computational Linguistics},
    url       = {https://aclanthology.org/2025.bsnlp-1.4/},
    doi       = {10.18653/v1/2025.bsnlp-1.4},
    pages     = {25--37},
    isbn      = {978-1-959429-57-9},
    abstract  = {We present a benchmark dataset for evaluating language model safety in Polish, addressing the underrepresentation of medium-resource languages in existing safety assessments. Our dataset includes both original and adversarially perturbed examples. We fine-tune and evaluate multiple models{---}LlamaGuard-3-8B, a HerBERT-based classifier, and PLLuM{---}and find that the HerBERT-based model outperforms others, especially under adversarial conditions.}
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="krasnodebska-etal-2025-pl">
    <!-- Title of the paper -->
    <titleInfo>
      <title>PL-Guard: Benchmarking Language Model Safety for Polish</title>
    </titleInfo>

    <!-- Authors, in publication order -->
    <name type="personal">
      <namePart type="given">Aleksandra</namePart>
      <namePart type="family">Krasnodebska</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Karolina</namePart>
      <namePart type="family">Seweryn</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Szymon</namePart>
      <namePart type="family">Łukasik</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Wojciech</namePart>
      <namePart type="family">Kusa</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>

    <originInfo>
      <dateIssued>2025-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>

    <!-- Host item: the proceedings volume, with its editors, publisher, place and ISBN -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 10th Workshop on Slavic Natural Language Processing (Slavic NLP 2025)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Jakub</namePart>
        <namePart type="family">Piskorski</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Pavel</namePart>
        <namePart type="family">Přibáň</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Preslav</namePart>
        <namePart type="family">Nakov</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Roman</namePart>
        <namePart type="family">Yangarber</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Michal</namePart>
        <namePart type="family">Marcinczuk</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Vienna, Austria</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">978-1-959429-57-9</identifier>
    </relatedItem>

    <abstract>We present a benchmark dataset for evaluating language model safety in Polish, addressing the underrepresentation of medium-resource languages in existing safety assessments. Our dataset includes both original and adversarially perturbed examples. We fine-tune and evaluate multiple models—LlamaGuard-3-8B, a HerBERT-based classifier, and PLLuM—and find that the HerBERT-based model outperforms others, especially under adversarial conditions.</abstract>

    <!-- Identifiers and landing page for the paper -->
    <identifier type="citekey">krasnodebska-etal-2025-pl</identifier>
    <identifier type="doi">10.18653/v1/2025.bsnlp-1.4</identifier>
    <location>
      <url>https://aclanthology.org/2025.bsnlp-1.4/</url>
    </location>

    <!-- Date and page range within the host volume -->
    <part>
      <date>2025-07</date>
      <extent unit="page">
        <start>25</start>
        <end>37</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T PL-Guard: Benchmarking Language Model Safety for Polish
%A Krasnodebska, Aleksandra
%A Seweryn, Karolina
%A Łukasik, Szymon
%A Kusa, Wojciech
%Y Piskorski, Jakub
%Y Přibáň, Pavel
%Y Nakov, Preslav
%Y Yangarber, Roman
%Y Marcinczuk, Michal
%S Proceedings of the 10th Workshop on Slavic Natural Language Processing (Slavic NLP 2025)
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 978-1-959429-57-9
%F krasnodebska-etal-2025-pl
%X We present a benchmark dataset for evaluating language model safety in Polish, addressing the underrepresentation of medium-resource languages in existing safety assessments. Our dataset includes both original and adversarially perturbed examples. We fine-tune and evaluate multiple models—LlamaGuard-3-8B, a HerBERT-based classifier, and PLLuM—and find that the HerBERT-based model outperforms others, especially under adversarial conditions.
%R 10.18653/v1/2025.bsnlp-1.4
%U https://aclanthology.org/2025.bsnlp-1.4/
%U https://doi.org/10.18653/v1/2025.bsnlp-1.4
%P 25-37
Markdown (Informal)
[PL-Guard: Benchmarking Language Model Safety for Polish](https://aclanthology.org/2025.bsnlp-1.4/) (Krasnodebska et al., BSNLP 2025)
ACL
- Aleksandra Krasnodebska, Karolina Seweryn, Szymon Łukasik, and Wojciech Kusa. 2025. PL-Guard: Benchmarking Language Model Safety for Polish. In Proceedings of the 10th Workshop on Slavic Natural Language Processing (Slavic NLP 2025), pages 25–37, Vienna, Austria. Association for Computational Linguistics.