@inproceedings{kadaoui-etal-2026-jeem,
title = "{JEEM}: Vision-Language Understanding in Four {A}rabic Dialects",
author = "Kadaoui, Karima and
Atwany, Hanin and
Al-Ali, Hamdan and
Mohamed, Abdelrahman and
Mekky, Ali and
Tilga, Sergei and
Fedorova, Natalia and
Artemova, Ekaterina and
Aldarmaki, Hanan and
Kementchedjhieva, Yova",
editor = "Demberg, Vera and
Inui, Kentaro and
Marquez, Llu{\'i}s",
booktitle = "Findings of the {A}ssociation for {C}omputational {L}inguistics: {EACL} 2026",
month = mar,
year = "2026",
address = "Rabat, Morocco",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.findings-eacl.18/",
pages = "331--354",
ISBN = "979-8-89176-386-9",
abstract = "We introduce JEEM, a benchmark designed to evaluate Vision-Language Models (VLMs) on visual understanding across four Arabic-speaking countries: Jordan, The Emirates, Egypt, and Morocco. JEEM includes the tasks of image captioning and visual question answering, and features culturally rich and regionally diverse content. This dataset aims to assess the ability of VLMs to generalize across dialects and accurately interpret cultural elements in visual contexts. In an evaluation of five prominent open-source Arabic VLMs and GPT-4o, we find that the Arabic VLMs consistently underperform, struggling with both visual understanding and dialect-specific generation. While GPT-4o ranks best in this comparison, the model{'}s linguistic competence varies across dialects, and its visual understanding capabilities lag behind. This underscores the need for more inclusive models and the value of culturally-diverse evaluation paradigms."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="kadaoui-etal-2026-jeem">
<titleInfo>
<title>JEEM: Vision-Language Understanding in Four Arabic Dialects</title>
</titleInfo>
<name type="personal">
<namePart type="given">Karima</namePart>
<namePart type="family">Kadaoui</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hanin</namePart>
<namePart type="family">Atwany</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hamdan</namePart>
<namePart type="family">Al-Ali</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Abdelrahman</namePart>
<namePart type="family">Mohamed</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ali</namePart>
<namePart type="family">Mekky</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sergei</namePart>
<namePart type="family">Tilga</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Natalia</namePart>
<namePart type="family">Fedorova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Artemova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hanan</namePart>
<namePart type="family">Aldarmaki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yova</namePart>
<namePart type="family">Kementchedjhieva</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EACL 2026</title>
</titleInfo>
<name type="personal">
<namePart type="given">Vera</namePart>
<namePart type="family">Demberg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kentaro</namePart>
<namePart type="family">Inui</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lluís</namePart>
<namePart type="family">Marquez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Rabat, Morocco</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-386-9</identifier>
</relatedItem>
<abstract>We introduce JEEM, a benchmark designed to evaluate Vision-Language Models (VLMs) on visual understanding across four Arabic-speaking countries: Jordan, The Emirates, Egypt, and Morocco. JEEM includes the tasks of image captioning and visual question answering, and features culturally rich and regionally diverse content. This dataset aims to assess the ability of VLMs to generalize across dialects and accurately interpret cultural elements in visual contexts. In an evaluation of five prominent open-source Arabic VLMs and GPT-4o, we find that the Arabic VLMs consistently underperform, struggling with both visual understanding and dialect-specific generation. While GPT-4o ranks best in this comparison, the model’s linguistic competence varies across dialects, and its visual understanding capabilities lag behind. This underscores the need for more inclusive models and the value of culturally-diverse evaluation paradigms.</abstract>
<identifier type="citekey">kadaoui-etal-2026-jeem</identifier>
<location>
<url>https://aclanthology.org/2026.findings-eacl.18/</url>
</location>
<part>
<date>2026-03</date>
<extent unit="page">
<start>331</start>
<end>354</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T JEEM: Vision-Language Understanding in Four Arabic Dialects
%A Kadaoui, Karima
%A Atwany, Hanin
%A Al-Ali, Hamdan
%A Mohamed, Abdelrahman
%A Mekky, Ali
%A Tilga, Sergei
%A Fedorova, Natalia
%A Artemova, Ekaterina
%A Aldarmaki, Hanan
%A Kementchedjhieva, Yova
%Y Demberg, Vera
%Y Inui, Kentaro
%Y Marquez, Lluís
%S Findings of the Association for Computational Linguistics: EACL 2026
%D 2026
%8 March
%I Association for Computational Linguistics
%C Rabat, Morocco
%@ 979-8-89176-386-9
%F kadaoui-etal-2026-jeem
%X We introduce JEEM, a benchmark designed to evaluate Vision-Language Models (VLMs) on visual understanding across four Arabic-speaking countries: Jordan, The Emirates, Egypt, and Morocco. JEEM includes the tasks of image captioning and visual question answering, and features culturally rich and regionally diverse content. This dataset aims to assess the ability of VLMs to generalize across dialects and accurately interpret cultural elements in visual contexts. In an evaluation of five prominent open-source Arabic VLMs and GPT-4o, we find that the Arabic VLMs consistently underperform, struggling with both visual understanding and dialect-specific generation. While GPT-4o ranks best in this comparison, the model’s linguistic competence varies across dialects, and its visual understanding capabilities lag behind. This underscores the need for more inclusive models and the value of culturally-diverse evaluation paradigms.
%U https://aclanthology.org/2026.findings-eacl.18/
%P 331-354
Markdown (Informal)
[JEEM: Vision-Language Understanding in Four Arabic Dialects](https://aclanthology.org/2026.findings-eacl.18/) (Kadaoui et al., Findings 2026)
ACL
- Karima Kadaoui, Hanin Atwany, Hamdan Al-Ali, Abdelrahman Mohamed, Ali Mekky, Sergei Tilga, Natalia Fedorova, Ekaterina Artemova, Hanan Aldarmaki, and Yova Kementchedjhieva. 2026. JEEM: Vision-Language Understanding in Four Arabic Dialects. In Findings of the Association for Computational Linguistics: EACL 2026, pages 331–354, Rabat, Morocco. Association for Computational Linguistics.