@inproceedings{ghaboura-etal-2025-camel,
title = "{CAMEL}-Bench: A Comprehensive {A}rabic {LMM} Benchmark",
author = "Ghaboura, Sara and
Heakl, Ahmed and
Thawakar, Omkar and
Alharthi, Ali Husain Salem Abdulla and
Riahi, Ines and
Radman, Abduljalil and
Laaksonen, Jorma and
Khan, Fahad Shahbaz and
Khan, Salman and
Anwer, Rao Muhammad",
editor = "Chiruzzo, Luis and
Ritter, Alan and
Wang, Lu",
booktitle = "Findings of the Association for Computational Linguistics: NAACL 2025",
month = apr,
year = "2025",
address = "Albuquerque, New Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-naacl.105/",
doi = "10.18653/v1/2025.findings-naacl.105",
pages = "1970--1980",
ISBN = "979-8-89176-195-7",
abstract = "Recent years have witnessed a significant interest in developing large multi-modal models (LMMs) capable of performing various visual reasoning and understanding tasks. This has led to the introduction of multiple LMM benchmarks to evaluate LMMs on different tasks. However, most existing LMM evaluation benchmarks are predominantly English-centric. In this work, we develop a comprehensive LMM evaluation benchmark for the Arabic language to represent a large population of over 400 million speakers. The proposed benchmark, named CAMEL-Bench, comprises eight diverse domains and 38 sub-domains including, multi-image understanding, complex visual perception, handwritten document understanding, video understanding, medical imaging, plant diseases, and remote sensing-based land use understanding to evaluate broad scenario generalizability. Our CAMEL-Bench comprises around 29,036 questions that are filtered from a larger pool of samples, where the quality is manually verified by native speakers to ensure reliable model assessment. We conduct evaluations of both closed-source, including GPT-4 series, and open-source LMMs. Our analysis reveals the need for substantial improvement, especially among the bestopen-source models, with even the closed-source GPT-4o achieving an overall score of 62{\%}. Our benchmark will be publicly released."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="ghaboura-etal-2025-camel">
<titleInfo>
<title>CAMEL-Bench: A Comprehensive Arabic LMM Benchmark</title>
</titleInfo>
<name type="personal">
<namePart type="given">Sara</namePart>
<namePart type="family">Ghaboura</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ahmed</namePart>
<namePart type="family">Heakl</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Omkar</namePart>
<namePart type="family">Thawakar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ali</namePart>
<namePart type="given">Husain</namePart>
<namePart type="given">Salem</namePart>
<namePart type="given">Abdulla</namePart>
<namePart type="family">Alharthi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ines</namePart>
<namePart type="family">Riahi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Abduljalil</namePart>
<namePart type="family">Radman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jorma</namePart>
<namePart type="family">Laaksonen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Fahad</namePart>
<namePart type="given">Shahbaz</namePart>
<namePart type="family">Khan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Salman</namePart>
<namePart type="family">Khan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rao</namePart>
<namePart type="given">Muhammad</namePart>
<namePart type="family">Anwer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-04</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: NAACL 2025</title>
</titleInfo>
<name type="personal">
<namePart type="given">Luis</namePart>
<namePart type="family">Chiruzzo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alan</namePart>
<namePart type="family">Ritter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lu</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Albuquerque, New Mexico</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-195-7</identifier>
</relatedItem>
<abstract>Recent years have witnessed a significant interest in developing large multi-modal models (LMMs) capable of performing various visual reasoning and understanding tasks. This has led to the introduction of multiple LMM benchmarks to evaluate LMMs on different tasks. However, most existing LMM evaluation benchmarks are predominantly English-centric. In this work, we develop a comprehensive LMM evaluation benchmark for the Arabic language to represent a large population of over 400 million speakers. The proposed benchmark, named CAMEL-Bench, comprises eight diverse domains and 38 sub-domains including, multi-image understanding, complex visual perception, handwritten document understanding, video understanding, medical imaging, plant diseases, and remote sensing-based land use understanding to evaluate broad scenario generalizability. Our CAMEL-Bench comprises around 29,036 questions that are filtered from a larger pool of samples, where the quality is manually verified by native speakers to ensure reliable model assessment. We conduct evaluations of both closed-source, including GPT-4 series, and open-source LMMs. Our analysis reveals the need for substantial improvement, especially among the bestopen-source models, with even the closed-source GPT-4o achieving an overall score of 62%. Our benchmark will be publicly released.</abstract>
<identifier type="citekey">ghaboura-etal-2025-camel</identifier>
<identifier type="doi">10.18653/v1/2025.findings-naacl.105</identifier>
<location>
<url>https://aclanthology.org/2025.findings-naacl.105/</url>
</location>
<part>
<date>2025-04</date>
<extent unit="page">
<start>1970</start>
<end>1980</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T CAMEL-Bench: A Comprehensive Arabic LMM Benchmark
%A Ghaboura, Sara
%A Heakl, Ahmed
%A Thawakar, Omkar
%A Alharthi, Ali Husain Salem Abdulla
%A Riahi, Ines
%A Radman, Abduljalil
%A Laaksonen, Jorma
%A Khan, Fahad Shahbaz
%A Khan, Salman
%A Anwer, Rao Muhammad
%Y Chiruzzo, Luis
%Y Ritter, Alan
%Y Wang, Lu
%S Findings of the Association for Computational Linguistics: NAACL 2025
%D 2025
%8 April
%I Association for Computational Linguistics
%C Albuquerque, New Mexico
%@ 979-8-89176-195-7
%F ghaboura-etal-2025-camel
%X Recent years have witnessed a significant interest in developing large multi-modal models (LMMs) capable of performing various visual reasoning and understanding tasks. This has led to the introduction of multiple LMM benchmarks to evaluate LMMs on different tasks. However, most existing LMM evaluation benchmarks are predominantly English-centric. In this work, we develop a comprehensive LMM evaluation benchmark for the Arabic language to represent a large population of over 400 million speakers. The proposed benchmark, named CAMEL-Bench, comprises eight diverse domains and 38 sub-domains, including multi-image understanding, complex visual perception, handwritten document understanding, video understanding, medical imaging, plant diseases, and remote sensing-based land use understanding, to evaluate broad scenario generalizability. Our CAMEL-Bench comprises around 29,036 questions that are filtered from a larger pool of samples, where the quality is manually verified by native speakers to ensure reliable model assessment. We conduct evaluations of both closed-source LMMs, including the GPT-4 series, and open-source LMMs. Our analysis reveals the need for substantial improvement, especially among the best open-source models, with even the closed-source GPT-4o achieving an overall score of 62%. Our benchmark will be publicly released.
%R 10.18653/v1/2025.findings-naacl.105
%U https://aclanthology.org/2025.findings-naacl.105/
%U https://doi.org/10.18653/v1/2025.findings-naacl.105
%P 1970-1980
Markdown (Informal)
[CAMEL-Bench: A Comprehensive Arabic LMM Benchmark](https://aclanthology.org/2025.findings-naacl.105/) (Ghaboura et al., Findings 2025)
ACL
Sara Ghaboura, Ahmed Heakl, Omkar Thawakar, Ali Husain Salem Abdulla Alharthi, Ines Riahi, Abduljalil Radman, Jorma Laaksonen, Fahad Shahbaz Khan, Salman Khan, and Rao Muhammad Anwer. 2025. CAMEL-Bench: A Comprehensive Arabic LMM Benchmark. In Findings of the Association for Computational Linguistics: NAACL 2025, pages 1970–1980, Albuquerque, New Mexico. Association for Computational Linguistics.