@inproceedings{tasawong-etal-2025-shortcut,
title = "Shortcut Learning in Safety: The Impact of Keyword Bias in Safeguards",
author = "Tasawong, Panuthep and
Laosaengpha, Napat and
Ponwitayarat, Wuttikorn and
Lim, Sitiporn and
Manakul, Potsawee and
Cahyawijaya, Samuel and
Udomcharoenchaikit, Can and
Limkonchotiwat, Peerat and
Chuangsuwanich, Ekapol and
Nutanong, Sarana",
editor = "Derczynski, Leon and
Novikova, Jekaterina and
Chen, Muhao",
booktitle = "Proceedings of the First Workshop on LLM Security (LLMSEC)",
month = aug,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.llmsec-1.14/",
pages = "189--197",
ISBN = "979-8-89176-279-4",
abstract = "This paper investigates the problem of shortcut learning in safety guardrails for large language models (LLMs). It reveals that current safeguard models often rely excessively on superficial cues, such as specific keywords that are spuriously correlated with training labels, rather than genuinely understanding the input{'}s semantics or intent. As a result, their performance degrades significantly when there is a shift in keyword distribution. The paper also examines the impact of reducing shortcut reliance, showing that merely minimizing shortcut influence is insufficient. To build robust safeguard models, it is equally crucial to promote the use of intended features."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="tasawong-etal-2025-shortcut">
<titleInfo>
<title>Shortcut Learning in Safety: The Impact of Keyword Bias in Safeguards</title>
</titleInfo>
<name type="personal">
<namePart type="given">Panuthep</namePart>
<namePart type="family">Tasawong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Napat</namePart>
<namePart type="family">Laosaengpha</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wuttikorn</namePart>
<namePart type="family">Ponwitayarat</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sitiporn</namePart>
<namePart type="family">Lim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Potsawee</namePart>
<namePart type="family">Manakul</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Samuel</namePart>
<namePart type="family">Cahyawijaya</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Can</namePart>
<namePart type="family">Udomcharoenchaikit</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Peerat</namePart>
<namePart type="family">Limkonchotiwat</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekapol</namePart>
<namePart type="family">Chuangsuwanich</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sarana</namePart>
<namePart type="family">Nutanong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the First Workshop on LLM Security (LLMSEC)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Leon</namePart>
<namePart type="family">Derczynski</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jekaterina</namePart>
<namePart type="family">Novikova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Muhao</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-279-4</identifier>
</relatedItem>
<abstract>This paper investigates the problem of shortcut learning in safety guardrails for large language models (LLMs). It reveals that current safeguard models often rely excessively on superficial cues, such as specific keywords that are spuriously correlated with training labels, rather than genuinely understanding the input’s semantics or intent. As a result, their performance degrades significantly when there is a shift in keyword distribution. The paper also examines the impact of reducing shortcut reliance, showing that merely minimizing shortcut influence is insufficient. To build robust safeguard models, it is equally crucial to promote the use of intended features.</abstract>
<identifier type="citekey">tasawong-etal-2025-shortcut</identifier>
<location>
<url>https://aclanthology.org/2025.llmsec-1.14/</url>
</location>
<part>
<date>2025-08</date>
<extent unit="page">
<start>189</start>
<end>197</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Shortcut Learning in Safety: The Impact of Keyword Bias in Safeguards
%A Tasawong, Panuthep
%A Laosaengpha, Napat
%A Ponwitayarat, Wuttikorn
%A Lim, Sitiporn
%A Manakul, Potsawee
%A Cahyawijaya, Samuel
%A Udomcharoenchaikit, Can
%A Limkonchotiwat, Peerat
%A Chuangsuwanich, Ekapol
%A Nutanong, Sarana
%Y Derczynski, Leon
%Y Novikova, Jekaterina
%Y Chen, Muhao
%S Proceedings of the First Workshop on LLM Security (LLMSEC)
%D 2025
%8 August
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-279-4
%F tasawong-etal-2025-shortcut
%X This paper investigates the problem of shortcut learning in safety guardrails for large language models (LLMs). It reveals that current safeguard models often rely excessively on superficial cues, such as specific keywords that are spuriously correlated with training labels, rather than genuinely understanding the input’s semantics or intent. As a result, their performance degrades significantly when there is a shift in keyword distribution. The paper also examines the impact of reducing shortcut reliance, showing that merely minimizing shortcut influence is insufficient. To build robust safeguard models, it is equally crucial to promote the use of intended features.
%U https://aclanthology.org/2025.llmsec-1.14/
%P 189-197
Markdown (Informal)
[Shortcut Learning in Safety: The Impact of Keyword Bias in Safeguards](https://aclanthology.org/2025.llmsec-1.14/) (Tasawong et al., LLMSEC 2025)
ACL
- Panuthep Tasawong, Napat Laosaengpha, Wuttikorn Ponwitayarat, Sitiporn Lim, Potsawee Manakul, Samuel Cahyawijaya, Can Udomcharoenchaikit, Peerat Limkonchotiwat, Ekapol Chuangsuwanich, and Sarana Nutanong. 2025. Shortcut Learning in Safety: The Impact of Keyword Bias in Safeguards. In Proceedings of the First Workshop on LLM Security (LLMSEC), pages 189–197, Vienna, Austria. Association for Computational Linguistics.