% AbjadNLP 2026 workshop paper; ACL Anthology ID 2026.abjadnlp-1.37 (see url field).
% Case-sensitive words in title/booktitle are protected with whole-word braces.
@inproceedings{jamil-rafi-2026-code,
    title     = {Code-Switching as a Safety Failure Mode in Large Language Models: An Empirical Study of {Roman} {Urdu} across {English}, Mixed, and Transliteration-Only Inputs},
    author    = {Jamil, Waleed and Rafi, Saima},
    booktitle = {Proceedings of the 2nd Workshop on {NLP} for Languages Using {Arabic} Script},
    month     = mar,
    year      = {2026},
    address   = {Rabat, Morocco},
    publisher = {Association for Computational Linguistics},
    url       = {https://aclanthology.org/2026.abjadnlp-1.37/},
    pages     = {295--300},
    abstract  = {Large Language Models exhibit robust safety alignment when harmful intent is expressed in English, yet their resilience to code-switching and transliteration remains underexplored. This paper presents the first targeted investigation of code-switching as a safety failure mode, focusing on Roman Urdu{---}a widely used transliterated form common in informal and emotionally expressive communication. We introduce the Roman Urdu Adversarial Benchmark (RUAB), a semantically controlled evaluation benchmark designed to isolate linguistic variation from intent across four safety-critical categories: passive suicidal ideation, psychological distress, threat or intimidation, and coercion or emotional manipulation. Evaluating seven state-of-the-art models, we find that safety detection degrades consistently in code-switched and transliterated inputs, with the most pronounced failures occurring for passive suicidal ideation. Instruction-tuned and reasoning-capable models demonstrate greater robustness, suggesting these failures reflect alignment gaps rather than inherent model limitations. Our findings highlight transliteration and code-switching as under-recognized safety risks and motivate the development of linguistically inclusive, transliteration-aware safety methods.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- Single MODS record; its ID matches the citekey identifier below. -->
  <mods ID="jamil-rafi-2026-code">
    <titleInfo>
      <title>Code-Switching as a Safety Failure Mode in Large Language Models: An Empirical Study of Roman Urdu across English, Mixed, and Transliteration-Only Inputs</title>
    </titleInfo>
    <!-- Authors, in listed order. -->
    <name type="personal">
      <namePart type="given">Waleed</namePart>
      <namePart type="family">Jamil</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Saima</namePart>
      <namePart type="family">Rafi</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2026-03</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume that contains this paper. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 2nd Workshop on NLP for Languages Using Arabic Script</title>
      </titleInfo>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Rabat, Morocco</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Large Language Models exhibit robust safety alignment when harmful intent is expressed in English, yet their resilience to code-switching and transliteration remains underexplored. This paper presents the first targeted investigation of code-switching as a safety failure mode, focusing on Roman Urdu—a widely used transliterated form common in informal and emotionally expressive communication. We introduce the Roman Urdu Adversarial Benchmark (RUAB), a semantically controlled evaluation benchmark designed to isolate linguistic variation from intent across four safety-critical categories: passive suicidal ideation, psychological distress, threat or intimidation, and coercion or emotional manipulation. Evaluating seven state-of-the-art models, we find that safety detection degrades consistently in code-switched and transliterated inputs, with the most pronounced failures occurring for passive suicidal ideation. Instruction-tuned and reasoning-capable models demonstrate greater robustness, suggesting these failures reflect alignment gaps rather than inherent model limitations. Our findings highlight transliteration and code-switching as under-recognized safety risks and motivate the development of linguistically inclusive, transliteration-aware safety methods.</abstract>
    <identifier type="citekey">jamil-rafi-2026-code</identifier>
    <location>
      <url>https://aclanthology.org/2026.abjadnlp-1.37/</url>
    </location>
    <!-- Position of the paper within the host volume. -->
    <part>
      <date>2026-03</date>
      <extent unit="page">
        <start>295</start>
        <end>300</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Code-Switching as a Safety Failure Mode in Large Language Models: An Empirical Study of Roman Urdu across English, Mixed, and Transliteration-Only Inputs
%A Jamil, Waleed
%A Rafi, Saima
%S Proceedings of the 2nd Workshop on NLP for Languages Using Arabic Script
%D 2026
%8 March
%I Association for Computational Linguistics
%C Rabat, Morocco
%F jamil-rafi-2026-code
%X Large Language Models exhibit robust safety alignment when harmful intent is expressed in English, yet their resilience to code-switching and transliteration remains underexplored. This paper presents the first targeted investigation of code-switching as a safety failure mode, focusing on Roman Urdu—a widely used transliterated form common in informal and emotionally expressive communication. We introduce the Roman Urdu Adversarial Benchmark (RUAB), a semantically controlled evaluation benchmark designed to isolate linguistic variation from intent across four safety-critical categories: passive suicidal ideation, psychological distress, threat or intimidation, and coercion or emotional manipulation. Evaluating seven state-of-the-art models, we find that safety detection degrades consistently in code-switched and transliterated inputs, with the most pronounced failures occurring for passive suicidal ideation. Instruction-tuned and reasoning-capable models demonstrate greater robustness, suggesting these failures reflect alignment gaps rather than inherent model limitations. Our findings highlight transliteration and code-switching as under-recognized safety risks and motivate the development of linguistically inclusive, transliteration-aware safety methods.
%U https://aclanthology.org/2026.abjadnlp-1.37/
%P 295-300
Markdown (Informal)
[Code-Switching as a Safety Failure Mode in Large Language Models: An Empirical Study of Roman Urdu across English, Mixed, and Transliteration-Only Inputs](https://aclanthology.org/2026.abjadnlp-1.37/) (Jamil & Rafi, AbjadNLP 2026)
ACL