@inproceedings{alhassoun-etal-2025-saudi,
title = "Saudi-Alignment Benchmark: Assessing {LLM}s Alignment with Cultural Norms and Domain Knowledge in the Saudi Context",
author = "Alhassoun, Manal and
Alkhanen, Imaan Mohammed and
Alshalawi, Nouf and
Baazeem, Ibtehal and
Alsanie, Waleed",
editor = "Darwish, Kareem and
Ali, Ahmed and
Abu Farha, Ibrahim and
Touileb, Samia and
Zitouni, Imed and
Abdelali, Ahmed and
Al-Ghamdi, Sharefah and
Alkhereyf, Sakhar and
Zaghouani, Wajdi and
Khalifa, Salam and
AlKhamissi, Badr and
Almatham, Rawan and
Hamed, Injy and
Alyafeai, Zaid and
Alowisheq, Areeb and
Inoue, Go and
Mrini, Khalil and
Alshammari, Waad",
booktitle = "Proceedings of The Third Arabic Natural Language Processing Conference",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.arabicnlp-main.11/",
pages = "130--147",
ISBN = "979-8-89176-352-4",
abstract = "For effective use in specific countries, Large Language Models (LLMs) need a strong grasp of local culture and core knowledge to ensure socially appropriate, context-aware, and factually correct responses. Existing Arabic and Saudi benchmarks are limited, focusing mainly on dialects or lifestyle, with little attention to deeper cultural or domain-specific alignment from authoritative sources. To address this gap and the challenge LLMs face with non-Western cultural nuance, this study introduces the Saudi-Alignment Benchmark. It consists of 874 manually curated questions across two core cultural dimensions: Saudi Cultural and Ethical Norms, and Saudi Domain Knowledge. These questions span multiple subcategories and use three formats to assess different goals with verified sources. Our evaluation reveals significant variance in LLM alignment. GPT-4 achieved the highest overall accuracy (83.3{\%}), followed by ALLaM-7B (81.8{\%}) and Llama-3.3-70B (81.6{\%}), whereas Jais-30B exhibited a pronounced shortfall at 21.9{\%}. Furthermore, multilingual LLMs excelled in norms; ALLaM-7B in domain knowledge. Considering the effect of question format, LLMs generally excelled in selected-response formats but showed weaker results on generative tasks, indicating that recognition-based benchmarks alone may overestimate cultural and contextual alignment. These findings highlight the need for tailored benchmarks and reveal LLMs' limitations in achieving cultural grounding, particularly in underrepresented contexts like Saudi Arabia."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="alhassoun-etal-2025-saudi">
<titleInfo>
<title>Saudi-Alignment Benchmark: Assessing LLMs Alignment with Cultural Norms and Domain Knowledge in the Saudi Context</title>
</titleInfo>
<name type="personal">
<namePart type="given">Manal</namePart>
<namePart type="family">Alhassoun</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Imaan</namePart>
<namePart type="given">Mohammed</namePart>
<namePart type="family">Alkhanen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nouf</namePart>
<namePart type="family">Alshalawi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ibtehal</namePart>
<namePart type="family">Baazeem</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Waleed</namePart>
<namePart type="family">Alsanie</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of The Third Arabic Natural Language Processing Conference</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kareem</namePart>
<namePart type="family">Darwish</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ahmed</namePart>
<namePart type="family">Ali</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ibrahim</namePart>
<namePart type="family">Abu Farha</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Samia</namePart>
<namePart type="family">Touileb</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Imed</namePart>
<namePart type="family">Zitouni</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ahmed</namePart>
<namePart type="family">Abdelali</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sharefah</namePart>
<namePart type="family">Al-Ghamdi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sakhar</namePart>
<namePart type="family">Alkhereyf</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wajdi</namePart>
<namePart type="family">Zaghouani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Salam</namePart>
<namePart type="family">Khalifa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Badr</namePart>
<namePart type="family">AlKhamissi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rawan</namePart>
<namePart type="family">Almatham</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Injy</namePart>
<namePart type="family">Hamed</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zaid</namePart>
<namePart type="family">Alyafeai</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Areeb</namePart>
<namePart type="family">Alowisheq</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Go</namePart>
<namePart type="family">Inoue</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Khalil</namePart>
<namePart type="family">Mrini</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Waad</namePart>
<namePart type="family">Alshammari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Suzhou, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-352-4</identifier>
</relatedItem>
<abstract>For effective use in specific countries, Large Language Models (LLMs) need a strong grasp of local culture and core knowledge to ensure socially appropriate, context-aware, and factually correct responses. Existing Arabic and Saudi benchmarks are limited, focusing mainly on dialects or lifestyle, with little attention to deeper cultural or domain-specific alignment from authoritative sources. To address this gap and the challenge LLMs face with non-Western cultural nuance, this study introduces the Saudi-Alignment Benchmark. It consists of 874 manually curated questions across two core cultural dimensions: Saudi Cultural and Ethical Norms, and Saudi Domain Knowledge. These questions span multiple subcategories and use three formats to assess different goals with verified sources. Our evaluation reveals significant variance in LLM alignment. GPT-4 achieved the highest overall accuracy (83.3%), followed by ALLaM-7B (81.8%) and Llama-3.3-70B (81.6%), whereas Jais-30B exhibited a pronounced shortfall at 21.9%. Furthermore, multilingual LLMs excelled in norms; ALLaM-7B in domain knowledge. Considering the effect of question format, LLMs generally excelled in selected-response formats but showed weaker results on generative tasks, indicating that recognition-based benchmarks alone may overestimate cultural and contextual alignment. These findings highlight the need for tailored benchmarks and reveal LLMs’ limitations in achieving cultural grounding, particularly in underrepresented contexts like Saudi Arabia.</abstract>
<identifier type="citekey">alhassoun-etal-2025-saudi</identifier>
<location>
<url>https://aclanthology.org/2025.arabicnlp-main.11/</url>
</location>
<part>
<date>2025-11</date>
<extent unit="page">
<start>130</start>
<end>147</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Saudi-Alignment Benchmark: Assessing LLMs Alignment with Cultural Norms and Domain Knowledge in the Saudi Context
%A Alhassoun, Manal
%A Alkhanen, Imaan Mohammed
%A Alshalawi, Nouf
%A Baazeem, Ibtehal
%A Alsanie, Waleed
%Y Darwish, Kareem
%Y Ali, Ahmed
%Y Abu Farha, Ibrahim
%Y Touileb, Samia
%Y Zitouni, Imed
%Y Abdelali, Ahmed
%Y Al-Ghamdi, Sharefah
%Y Alkhereyf, Sakhar
%Y Zaghouani, Wajdi
%Y Khalifa, Salam
%Y AlKhamissi, Badr
%Y Almatham, Rawan
%Y Hamed, Injy
%Y Alyafeai, Zaid
%Y Alowisheq, Areeb
%Y Inoue, Go
%Y Mrini, Khalil
%Y Alshammari, Waad
%S Proceedings of The Third Arabic Natural Language Processing Conference
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-352-4
%F alhassoun-etal-2025-saudi
%X For effective use in specific countries, Large Language Models (LLMs) need a strong grasp of local culture and core knowledge to ensure socially appropriate, context-aware, and factually correct responses. Existing Arabic and Saudi benchmarks are limited, focusing mainly on dialects or lifestyle, with little attention to deeper cultural or domain-specific alignment from authoritative sources. To address this gap and the challenge LLMs face with non-Western cultural nuance, this study introduces the Saudi-Alignment Benchmark. It consists of 874 manually curated questions across two core cultural dimensions: Saudi Cultural and Ethical Norms, and Saudi Domain Knowledge. These questions span multiple subcategories and use three formats to assess different goals with verified sources. Our evaluation reveals significant variance in LLM alignment. GPT-4 achieved the highest overall accuracy (83.3%), followed by ALLaM-7B (81.8%) and Llama-3.3-70B (81.6%), whereas Jais-30B exhibited a pronounced shortfall at 21.9%. Furthermore, multilingual LLMs excelled in norms; ALLaM-7B in domain knowledge. Considering the effect of question format, LLMs generally excelled in selected-response formats but showed weaker results on generative tasks, indicating that recognition-based benchmarks alone may overestimate cultural and contextual alignment. These findings highlight the need for tailored benchmarks and reveal LLMs’ limitations in achieving cultural grounding, particularly in underrepresented contexts like Saudi Arabia.
%U https://aclanthology.org/2025.arabicnlp-main.11/
%P 130-147
Markdown (Informal)
[Saudi-Alignment Benchmark: Assessing LLMs Alignment with Cultural Norms and Domain Knowledge in the Saudi Context](https://aclanthology.org/2025.arabicnlp-main.11/) (Alhassoun et al., ArabicNLP 2025)
ACL