@inproceedings{zbib-etal-2026-aralingbench,
title = "{A}ra{L}ing{B}ench: A Human-Annotated Benchmark for Evaluating {A}rabic Linguistic Capabilities of Large Language Models",
author = "Zbib, Mohamad Bilal and
Hammoud, Hasan Abed Al Kader and
Mohanna, Ammar and
Rizk, Nadine and
Karnib, Fatima and
Moukaled, Sina and
Ghanem, Bernard",
booktitle = "Proceedings of the 2nd Workshop on {NLP} for Languages Using {A}rabic Script",
month = mar,
year = "2026",
address = "Rabat, Morocco",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.abjadnlp-1.45/",
pages = "385--393",
abstract = "We present AraLingBench, a fully human annotated benchmark for evaluating the Arabic linguistic competence of large language models (LLMs). The benchmark spans five core categories: grammar, morphology, spelling, reading comprehension, and syntax, through 150 expert designed multiple choice questions that directly assess structural language understanding. Evaluating 35 Arabic and bilingual LLMs reveals that current models demonstrate strong surface level proficiency but struggle with deeper grammatical and syntactic reasoning. AraLingBench highlights a persistent gap between high scores on knowledge-based benchmarks and true linguistic mastery, showing that many models succeed through memorization or pattern recognition rather than authentic comprehension. By isolating and measuring fundamental linguistic skills, AraLingBench provides a diagnostic framework for developing Arabic LLMs. The benchmark and evaluation code are available on Hugging Face and GitHub."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="zbib-etal-2026-aralingbench">
<titleInfo>
<title>AraLingBench: A Human-Annotated Benchmark for Evaluating Arabic Linguistic Capabilities of Large Language Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Mohamad</namePart>
<namePart type="given">Bilal</namePart>
<namePart type="family">Zbib</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hasan</namePart>
<namePart type="given">Abed</namePart>
<namePart type="given">Al</namePart>
<namePart type="given">Kader</namePart>
<namePart type="family">Hammoud</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ammar</namePart>
<namePart type="family">Mohanna</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nadine</namePart>
<namePart type="family">Rizk</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Fatima</namePart>
<namePart type="family">Karnib</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sina</namePart>
<namePart type="family">Moukaled</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bernard</namePart>
<namePart type="family">Ghanem</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2nd Workshop on NLP for Languages Using Arabic Script</title>
</titleInfo>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Rabat, Morocco</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We present AraLingBench, a fully human annotated benchmark for evaluating the Arabic linguistic competence of large language models (LLMs). The benchmark spans five core categories: grammar, morphology, spelling, reading comprehension, and syntax, through 150 expert designed multiple choice questions that directly assess structural language understanding. Evaluating 35 Arabic and bilingual LLMs reveals that current models demonstrate strong surface level proficiency but struggle with deeper grammatical and syntactic reasoning. AraLingBench highlights a persistent gap between high scores on knowledge-based benchmarks and true linguistic mastery, showing that many models succeed through memorization or pattern recognition rather than authentic comprehension. By isolating and measuring fundamental linguistic skills, AraLingBench provides a diagnostic framework for developing Arabic LLMs. The benchmark and evaluation code are available on Hugging Face and GitHub.</abstract>
<identifier type="citekey">zbib-etal-2026-aralingbench</identifier>
<location>
<url>https://aclanthology.org/2026.abjadnlp-1.45/</url>
</location>
<part>
<date>2026-03</date>
<extent unit="page">
<start>385</start>
<end>393</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T AraLingBench: A Human-Annotated Benchmark for Evaluating Arabic Linguistic Capabilities of Large Language Models
%A Zbib, Mohamad Bilal
%A Hammoud, Hasan Abed Al Kader
%A Mohanna, Ammar
%A Rizk, Nadine
%A Karnib, Fatima
%A Moukaled, Sina
%A Ghanem, Bernard
%S Proceedings of the 2nd Workshop on NLP for Languages Using Arabic Script
%D 2026
%8 March
%I Association for Computational Linguistics
%C Rabat, Morocco
%F zbib-etal-2026-aralingbench
%X We present AraLingBench, a fully human annotated benchmark for evaluating the Arabic linguistic competence of large language models (LLMs). The benchmark spans five core categories: grammar, morphology, spelling, reading comprehension, and syntax, through 150 expert designed multiple choice questions that directly assess structural language understanding. Evaluating 35 Arabic and bilingual LLMs reveals that current models demonstrate strong surface level proficiency but struggle with deeper grammatical and syntactic reasoning. AraLingBench highlights a persistent gap between high scores on knowledge-based benchmarks and true linguistic mastery, showing that many models succeed through memorization or pattern recognition rather than authentic comprehension. By isolating and measuring fundamental linguistic skills, AraLingBench provides a diagnostic framework for developing Arabic LLMs. The benchmark and evaluation code are available on Hugging Face and GitHub.
%U https://aclanthology.org/2026.abjadnlp-1.45/
%P 385-393
Markdown (Informal)
[AraLingBench: A Human-Annotated Benchmark for Evaluating Arabic Linguistic Capabilities of Large Language Models](https://aclanthology.org/2026.abjadnlp-1.45/) (Zbib et al., AbjadNLP 2026)
ACL