% Conference-paper record for "IndicSteer" (StereACuLT 2026 workshop proceedings).
% Case-sensitive words are protected with whole-word braces so that styles which
% sentence-case titles keep their capitalisation without breaking kerning.
@inproceedings{muhammad-etal-2026-indicsteer,
    title     = {{IndicSteer}: Inference-Time Safety Steering for {Indic} {LLMs}},
    author    = {Muhammad, Ruhaib and
                 Rajaram, Saahas Vijayalakshmi and
                 Durairaj, Suriya Priyan},
    editor    = {Ma, Weicheng and
                 Vosoughi, Soroush and
                 Gillani, Nabeel and
                 Coto-Solano, Rolando},
    booktitle = {Proceedings of the 1st Workshop on Stereotypes Across Cultures in Language Technologies ({StereACuLT} 2026)},
    month     = jul,
    year      = {2026},
    address   = {San Diego, California, United States},
    publisher = {Association for Computational Linguistics},
    url       = {https://aclanthology.org/2026.stereacult-1.12/},
    pages     = {126--136},
    isbn      = {979-8-89176-408-8},
    abstract  = {Safety controls for Indic language generation must account for multilingual variation and culturally grounded harm categories that are underrepresented in English-centric resources. We present IndicSteer, an initial study of inference-time activation steering for safety across 8 harm categories and 9 Indic language settings, based on contrastive directions computed from safe/unsafe response pairs. To the best of our knowledge, this is the first application of Contrastive Activation Addition (CAA) to Indic LLMs. Evaluation uses a structured LLM-as-a-judge protocol with strict isolation by category and alpha, covering $\approx$12,960 prompt-response pairs. We report harmful-response and coherence metrics for Sarvam-1 and OpenHathi (Hindi track), and present cross-lingual representation structure via linear CKA for Sarvam-1 and Krutrim-2-Instruct. On matched slices, Sarvam-1 at $\alpha=12$ reduces harmful rate from 73.47{\%} to 41.34{\%} (32.13 pp; 43.73{\%} relative) with no additional retraining. For OpenHathi Hindi, harmful rate falls monotonically from 85.83{\%} (baseline) to 27.13{\%} at $\alpha=15$, a 58.71 pp total reduction.}
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- MODS v3 record for the IndicSteer paper: three authors, the host
       proceedings volume with its four editors, abstract, URL and page range. -->
  <mods ID="muhammad-etal-2026-indicsteer">
    <titleInfo>
      <title>IndicSteer: Inference-Time Safety Steering for Indic LLMs</title>
    </titleInfo>
    <!-- Authors, in byline order. -->
    <name type="personal">
      <namePart type="given">Ruhaib</namePart>
      <namePart type="family">Muhammad</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Saahas</namePart>
      <namePart type="given">Vijayalakshmi</namePart>
      <namePart type="family">Rajaram</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Suriya</namePart>
      <namePart type="given">Priyan</namePart>
      <namePart type="family">Durairaj</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2026-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume this paper appears in. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 1st Workshop on Stereotypes Across Cultures in Language Technologies (StereACuLT 2026)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Weicheng</namePart>
        <namePart type="family">Ma</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Soroush</namePart>
        <namePart type="family">Vosoughi</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Nabeel</namePart>
        <namePart type="family">Gillani</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Rolando</namePart>
        <namePart type="family">Coto-Solano</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">San Diego, California, United States</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-408-8</identifier>
    </relatedItem>
    <!-- Plain-text abstract: math symbols are written as Unicode, not LaTeX macros. -->
    <abstract>Safety controls for Indic language generation must account for multilingual variation and culturally grounded harm categories that are underrepresented in English-centric resources. We present IndicSteer, an initial study of inference-time activation steering for safety across 8 harm categories and 9 Indic language settings, based on contrastive directions computed from safe/unsafe response pairs. To the best of our knowledge, this is the first application of Contrastive Activation Addition (CAA) to Indic LLMs. Evaluation uses a structured LLM-as-a-judge protocol with strict isolation by category and alpha, covering ≈12,960 prompt-response pairs. We report harmful-response and coherence metrics for Sarvam-1 and OpenHathi (Hindi track), and present cross-lingual representation structure via linear CKA for Sarvam-1 and Krutrim-2-Instruct. On matched slices, Sarvam-1 at α=12 reduces harmful rate from 73.47% to 41.34% (32.13 pp; 43.73% relative) with no additional retraining. For OpenHathi Hindi, harmful rate falls monotonically from 85.83% (baseline) to 27.13% at α=15, a 58.71 pp total reduction.</abstract>
    <identifier type="citekey">muhammad-etal-2026-indicsteer</identifier>
    <location>
      <url>https://aclanthology.org/2026.stereacult-1.12/</url>
    </location>
    <part>
      <date>2026-07</date>
      <extent unit="page">
        <start>126</start>
        <end>136</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T IndicSteer: Inference-Time Safety Steering for Indic LLMs
%A Muhammad, Ruhaib
%A Rajaram, Saahas Vijayalakshmi
%A Durairaj, Suriya Priyan
%Y Ma, Weicheng
%Y Vosoughi, Soroush
%Y Gillani, Nabeel
%Y Coto-Solano, Rolando
%S Proceedings of the 1st Workshop on Stereotypes Across Cultures in Language Technologies (StereACuLT 2026)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-408-8
%F muhammad-etal-2026-indicsteer
%X Safety controls for Indic language generation must account for multilingual variation and culturally grounded harm categories that are underrepresented in English-centric resources. We present IndicSteer, an initial study of inference-time activation steering for safety across 8 harm categories and 9 Indic language settings, based on contrastive directions computed from safe/unsafe response pairs. To the best of our knowledge, this is the first application of Contrastive Activation Addition (CAA) to Indic LLMs. Evaluation uses a structured LLM-as-a-judge protocol with strict isolation by category and alpha, covering ≈12,960 prompt-response pairs. We report harmful-response and coherence metrics for Sarvam-1 and OpenHathi (Hindi track), and present cross-lingual representation structure via linear CKA for Sarvam-1 and Krutrim-2-Instruct. On matched slices, Sarvam-1 at α=12 reduces harmful rate from 73.47% to 41.34% (32.13 pp; 43.73% relative) with no additional retraining. For OpenHathi Hindi, harmful rate falls monotonically from 85.83% (baseline) to 27.13% at α=15, a 58.71 pp total reduction.
%U https://aclanthology.org/2026.stereacult-1.12/
%P 126-136
Markdown (Informal)
[IndicSteer: Inference-Time Safety Steering for Indic LLMs](https://aclanthology.org/2026.stereacult-1.12/) (Muhammad et al., StereACuLT 2026)
ACL
- Ruhaib Muhammad, Saahas Vijayalakshmi Rajaram, and Suriya Priyan Durairaj. 2026. IndicSteer: Inference-Time Safety Steering for Indic LLMs. In Proceedings of the 1st Workshop on Stereotypes Across Cultures in Language Technologies (StereACuLT 2026), pages 126–136, San Diego, California, United States. Association for Computational Linguistics.