@inproceedings{lamsiyah-etal-2025-arabicsense,
title = "{A}rabic{S}ense: A Benchmark for Evaluating Commonsense Reasoning in {A}rabic with Large Language Models",
author = "Lamsiyah, Salima and
Zeinalipour, Kamyar and
El amrany, Samir and
Brust, Matthias and
Maggini, Marco and
Bouvry, Pascal and
Schommer, Christoph",
editor = "Ezzini, Saad and
Alami, Hamza and
Berrada, Ismail and
Benlahbib, Abdessamad and
El Mahdaouy, Abdelkader and
Lamsiyah, Salima and
Derrouz, Hatim and
Haddad Haddad, Amal and
Jarrar, Mustafa and
El-Haj, Mo and
Mitkov, Ruslan and
Rayson, Paul",
booktitle = "Proceedings of the 4th Workshop on Arabic Corpus Linguistics (WACL-4)",
month = jan,
year = "2025",
address = "Abu Dhabi, UAE",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.wacl-1.1/",
pages = "1--11",
abstract = "Recent efforts in natural language processing (NLP) commonsense reasoning research have led to the development of numerous new datasets and benchmarks. However, these resources have predominantly been limited to English, leaving a gap in evaluating commonsense reasoning in other languages. In this paper, we introduce the ArabicSense Benchmark, which is designed to thoroughly evaluate the world-knowledge commonsense reasoning abilities of large language models (LLMs) in Arabic. This benchmark includes three main tasks: first, it tests whether a system can distinguish between natural language statements that make sense and those that do not; second, it requires a system to identify the most crucial reason why a nonsensical statement fails to make sense; and third, it involves generating explanations for why statements do not make sense. We evaluate several Arabic BERT-based models and causal LLMs on these tasks. Experimental results demonstrate improvements after fine-tuning on our dataset. For instance, AraBERT v2 achieved an 87{\%} F1 score on the second task, while Gemma and Mistral-7b achieved F1 scores of 95.5{\%} and 94.8{\%}, respectively. For the generation task, LLaMA-3 achieved the best performance with a BERTScore F1 of 77.3{\%}, closely followed by Mistral-7b at 77.1{\%}. All codes and the benchmark will be made publicly available at https://github.com/."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="lamsiyah-etal-2025-arabicsense">
<titleInfo>
<title>ArabicSense: A Benchmark for Evaluating Commonsense Reasoning in Arabic with Large Language Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Salima</namePart>
<namePart type="family">Lamsiyah</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kamyar</namePart>
<namePart type="family">Zeinalipour</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Samir</namePart>
<namePart type="family">El amrany</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Matthias</namePart>
<namePart type="family">Brust</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marco</namePart>
<namePart type="family">Maggini</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pascal</namePart>
<namePart type="family">Bouvry</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Christoph</namePart>
<namePart type="family">Schommer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-01</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 4th Workshop on Arabic Corpus Linguistics (WACL-4)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Saad</namePart>
<namePart type="family">Ezzini</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hamza</namePart>
<namePart type="family">Alami</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ismail</namePart>
<namePart type="family">Berrada</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Abdessamad</namePart>
<namePart type="family">Benlahbib</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Abdelkader</namePart>
<namePart type="family">El Mahdaouy</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Salima</namePart>
<namePart type="family">Lamsiyah</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hatim</namePart>
<namePart type="family">Derrouz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Amal</namePart>
<namePart type="family">Haddad Haddad</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mustafa</namePart>
<namePart type="family">Jarrar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mo</namePart>
<namePart type="family">El-Haj</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ruslan</namePart>
<namePart type="family">Mitkov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Paul</namePart>
<namePart type="family">Rayson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, UAE</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
    <abstract>Recent efforts in natural language processing (NLP) commonsense reasoning research have led to the development of numerous new datasets and benchmarks. However, these resources have predominantly been limited to English, leaving a gap in evaluating commonsense reasoning in other languages. In this paper, we introduce the ArabicSense Benchmark, which is designed to thoroughly evaluate the world-knowledge commonsense reasoning abilities of large language models (LLMs) in Arabic. This benchmark includes three main tasks: first, it tests whether a system can distinguish between natural language statements that make sense and those that do not; second, it requires a system to identify the most crucial reason why a nonsensical statement fails to make sense; and third, it involves generating explanations for why statements do not make sense. We evaluate several Arabic BERT-based models and causal LLMs on these tasks. Experimental results demonstrate improvements after fine-tuning on our dataset. For instance, AraBERT v2 achieved an 87% F1 score on the second task, while Gemma and Mistral-7b achieved F1 scores of 95.5% and 94.8%, respectively. For the generation task, LLaMA-3 achieved the best performance with a BERTScore F1 of 77.3%, closely followed by Mistral-7b at 77.1%. All code and the benchmark will be made publicly available at https://github.com/.</abstract>
    <identifier type="citekey">lamsiyah-etal-2025-arabicsense</identifier>
    <location>
      <url>https://aclanthology.org/2025.wacl-1.1/</url>
    </location>
    <part>
      <date>2025-01</date>
      <extent unit="page">
        <start>1</start>
        <end>11</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T ArabicSense: A Benchmark for Evaluating Commonsense Reasoning in Arabic with Large Language Models
%A Lamsiyah, Salima
%A Zeinalipour, Kamyar
%A El amrany, Samir
%A Brust, Matthias
%A Maggini, Marco
%A Bouvry, Pascal
%A Schommer, Christoph
%Y Ezzini, Saad
%Y Alami, Hamza
%Y Berrada, Ismail
%Y Benlahbib, Abdessamad
%Y El Mahdaouy, Abdelkader
%Y Lamsiyah, Salima
%Y Derrouz, Hatim
%Y Haddad Haddad, Amal
%Y Jarrar, Mustafa
%Y El-Haj, Mo
%Y Mitkov, Ruslan
%Y Rayson, Paul
%S Proceedings of the 4th Workshop on Arabic Corpus Linguistics (WACL-4)
%D 2025
%8 January
%I Association for Computational Linguistics
%C Abu Dhabi, UAE
%F lamsiyah-etal-2025-arabicsense
%X Recent efforts in natural language processing (NLP) commonsense reasoning research have led to the development of numerous new datasets and benchmarks. However, these resources have predominantly been limited to English, leaving a gap in evaluating commonsense reasoning in other languages. In this paper, we introduce the ArabicSense Benchmark, which is designed to thoroughly evaluate the world-knowledge commonsense reasoning abilities of large language models (LLMs) in Arabic. This benchmark includes three main tasks: first, it tests whether a system can distinguish between natural language statements that make sense and those that do not; second, it requires a system to identify the most crucial reason why a nonsensical statement fails to make sense; and third, it involves generating explanations for why statements do not make sense. We evaluate several Arabic BERT-based models and causal LLMs on these tasks. Experimental results demonstrate improvements after fine-tuning on our dataset. For instance, AraBERT v2 achieved an 87% F1 score on the second task, while Gemma and Mistral-7b achieved F1 scores of 95.5% and 94.8%, respectively. For the generation task, LLaMA-3 achieved the best performance with a BERTScore F1 of 77.3%, closely followed by Mistral-7b at 77.1%. All code and the benchmark will be made publicly available at https://github.com/.
%U https://aclanthology.org/2025.wacl-1.1/
%P 1-11
Markdown (Informal)
[ArabicSense: A Benchmark for Evaluating Commonsense Reasoning in Arabic with Large Language Models](https://aclanthology.org/2025.wacl-1.1/) (Lamsiyah et al., WACL 2025)
ACL
Salima Lamsiyah, Kamyar Zeinalipour, Samir El amrany, Matthias Brust, Marco Maggini, Pascal Bouvry, and Christoph Schommer. 2025. [ArabicSense: A Benchmark for Evaluating Commonsense Reasoning in Arabic with Large Language Models](https://aclanthology.org/2025.wacl-1.1/). In *Proceedings of the 4th Workshop on Arabic Corpus Linguistics (WACL-4)*, pages 1–11, Abu Dhabi, UAE. Association for Computational Linguistics.
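
The abstract reports BERTScore F1 for the explanation-generation task. As a minimal sketch of how such a score can be computed for Arabic text, the snippet below uses the open-source `bert-score` package; the candidate and reference strings are invented placeholders, not items from the ArabicSense benchmark.

```python
# Minimal sketch: scoring a generated Arabic explanation against a reference
# explanation with BERTScore. Assumes `pip install bert-score`; the example
# strings below are invented placeholders, not ArabicSense data.
from bert_score import score

candidates = ["الجملة غير منطقية لأن الشمس تشرق من الشرق وليس من الغرب."]
references = ["هذه الجملة لا معنى لها لأن شروق الشمس يكون من جهة الشرق."]

# lang="ar" selects a multilingual backbone suitable for Arabic.
P, R, F1 = score(candidates, references, lang="ar", verbose=False)

# The paper reports BERTScore F1 as a percentage, e.g. 77.3%.
print(f"BERTScore F1: {F1.mean().item() * 100:.1f}%")
```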