@inproceedings{fatehkia-etal-2026-fanarguard,
title = "{F}anar{G}uard: A Culturally-Aware Moderation Filter for {A}rabic Language Models",
author = "Fatehkia, Masoomali and
Altinisik, Enes and
Sencar, Husrev Taha",
editor = "Demberg, Vera and
Inui, Kentaro and
Marquez, Llu{\'i}s",
booktitle = "Proceedings of the 19th Conference of the {E}uropean Chapter of the {A}ssociation for {C}omputational {L}inguistics (Volume 1: Long Papers)",
month = mar,
year = "2026",
address = "Rabat, Morocco",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.eacl-long.368/",
pages = "7848--7869",
ISBN = "979-8-89176-380-7",
abstract = "Content moderation filters are a critical safeguard against alignment failures in language models. Yet most existing filters focus narrowly on general safety and overlook cultural context. In this work, we introduce FanarGuard, a bilingual moderation filter that evaluates both safety and cultural alignment in Arabic and English. We construct a dataset of over 468K prompt and response pairs, drawn from synthetic and public datasets, scored by a panel of LLM judges on harmlessness and cultural awareness, and use it to train two filter variants.To rigorously evaluate cultural alignment, we further develop the first benchmark targeting Arabic cultural contexts, comprising over 1K norm-sensitive prompts with LLM-generated responses annotated by human raters. Results show that FanarGuard achieves stronger agreement with human annotations than inter-annotator reliability, while matching the performance of state-of-the-art filters on safety benchmarks. These findings highlight the importance of integrating cultural awareness into moderation and establish FanarGuard as a practical step toward more context-sensitive safeguards."
}