% ACL Anthology record 2025.abjadnlp-1.11 (AbjadNLP 2025 workshop paper).
@inproceedings{alharbi-etal-2025-evaluating,
    title = "Evaluating Large Language Models on Health-Related Claims Across {A}rabic Dialects",
    author = "Alharbi, Abdulsalam obaid and
      Alsuhaibani, Abdullah and
      Alalawi, Abdulrahman Abdullah and
      Naseem, Usman and
      Jameel, Shoaib and
      Kanhere, Salil and
      Razzak, Imran",
    editor = "El-Haj, Mo",
    booktitle = "Proceedings of the 1st Workshop on NLP for Languages Using Arabic Script",
    month = jan,
    year = "2025",
    address = "Abu Dhabi, UAE",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2025.abjadnlp-1.11/",
    pages = "95--103",
    abstract = "While the Large Language Models (LLMs) have been popular in different tasks, their capability to handle health-related claims in diverse linguistic and cultural contexts, such as Arabic dialects, Saudi, Egyptian, Lebanese, and Moroccan has not been thoroughly explored. To this end, we develop a comprehensive evaluation framework to assess how LLMs particularly GPT-4 respond to health-related claims. Our framework focuses on measuring factual accuracy, consistency, and cultural adaptability. It introduces a new metric, the {\textquotedblleft}Cultural Sensitivity Score{\textquotedblright}, to evaluate the model's ability to adjust responses based on dialectal differences. Additionally, the reasoning patterns used by the models are analyzed to assess their effectiveness in engaging with claims across these dialects. Our findings highlight that while LLMs excel in recognizing true claims, they encounter difficulties with mixed and ambiguous claims, especially in underrepresented dialects. This work underscores the importance of dialect-specific evaluations to ensure accurate, contextually appropriate, and culturally sensitive responses from LLMs in real-world applications."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="alharbi-etal-2025-evaluating">
<titleInfo>
<title>Evaluating Large Language Models on Health-Related Claims Across Arabic Dialects</title>
</titleInfo>
<name type="personal">
<namePart type="given">Abdulsalam</namePart>
<namePart type="given">obaid</namePart>
<namePart type="family">Alharbi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Abdullah</namePart>
<namePart type="family">Alsuhaibani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Abdulrahman</namePart>
<namePart type="given">Abdullah</namePart>
<namePart type="family">Alalawi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Usman</namePart>
<namePart type="family">Naseem</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shoaib</namePart>
<namePart type="family">Jameel</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Salil</namePart>
<namePart type="family">Kanhere</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Imran</namePart>
<namePart type="family">Razzak</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-01</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 1st Workshop on NLP for Languages Using Arabic Script</title>
</titleInfo>
<name type="personal">
<namePart type="given">Mo</namePart>
<namePart type="family">El-Haj</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, UAE</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>While the Large Language Models (LLMs) have been popular in different tasks, their capability to handle health-related claims in diverse linguistic and cultural contexts, such as Arabic dialects, Saudi, Egyptian, Lebanese, and Moroccan has not been thoroughly explored. To this end, we develop a comprehensive evaluation framework to assess how LLMs particularly GPT-4 respond to health-related claims. Our framework focuses on measuring factual accuracy, consistency, and cultural adaptability. It introduces a new metric, the “Cultural Sensitivity Score”, to evaluate the model’s ability to adjust responses based on dialectal differences. Additionally, the reasoning patterns used by the models are analyzed to assess their effectiveness in engaging with claims across these dialects. Our findings highlight that while LLMs excel in recognizing true claims, they encounter difficulties with mixed and ambiguous claims, especially in underrepresented dialects. This work underscores the importance of dialect-specific evaluations to ensure accurate, contextually appropriate, and culturally sensitive responses from LLMs in real-world applications.</abstract>
<identifier type="citekey">alharbi-etal-2025-evaluating</identifier>
<location>
<url>https://aclanthology.org/2025.abjadnlp-1.11/</url>
</location>
<part>
<date>2025-01</date>
<extent unit="page">
<start>95</start>
<end>103</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Evaluating Large Language Models on Health-Related Claims Across Arabic Dialects
%A Alharbi, Abdulsalam obaid
%A Alsuhaibani, Abdullah
%A Alalawi, Abdulrahman Abdullah
%A Naseem, Usman
%A Jameel, Shoaib
%A Kanhere, Salil
%A Razzak, Imran
%Y El-Haj, Mo
%S Proceedings of the 1st Workshop on NLP for Languages Using Arabic Script
%D 2025
%8 January
%I Association for Computational Linguistics
%C Abu Dhabi, UAE
%F alharbi-etal-2025-evaluating
%X While the Large Language Models (LLMs) have been popular in different tasks, their capability to handle health-related claims in diverse linguistic and cultural contexts, such as Arabic dialects, Saudi, Egyptian, Lebanese, and Moroccan has not been thoroughly explored. To this end, we develop a comprehensive evaluation framework to assess how LLMs particularly GPT-4 respond to health-related claims. Our framework focuses on measuring factual accuracy, consistency, and cultural adaptability. It introduces a new metric, the “Cultural Sensitivity Score”, to evaluate the model’s ability to adjust responses based on dialectal differences. Additionally, the reasoning patterns used by the models are analyzed to assess their effectiveness in engaging with claims across these dialects. Our findings highlight that while LLMs excel in recognizing true claims, they encounter difficulties with mixed and ambiguous claims, especially in underrepresented dialects. This work underscores the importance of dialect-specific evaluations to ensure accurate, contextually appropriate, and culturally sensitive responses from LLMs in real-world applications.
%U https://aclanthology.org/2025.abjadnlp-1.11/
%P 95-103
Markdown (Informal)
[Evaluating Large Language Models on Health-Related Claims Across Arabic Dialects](https://aclanthology.org/2025.abjadnlp-1.11/) (Alharbi et al., AbjadNLP 2025)
ACL