@inproceedings{belayachi-mazroui-2026-alkhalil,
title = "Alkhalil Corpus: An Open-Source Thematic and Lemmatized Corpus for {M}odern {S}tandard {A}rabic",
author = "Belayachi, Samir and
Mazroui, Azzeddine",
booktitle = "Proceedings of the 2nd Workshop on {NLP} for Languages Using {A}rabic Script",
month = mar,
year = "2026",
address = "Rabat, Morocco",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.abjadnlp-1.27/",
pages = "192--197",
abstract = "The availability of large annotated corpora remains a major challenge for the development of natural language processing systems for under-resourced languages such as Arabic. In this paper, we present two annotated corpora dedicated to Modern Standard Arabic. These corpora are open-source and freely available on the Hugging Face platform. The first corpus, annotated by theme and designed to provide a balanced representation of contemporary Arabic usage, comprises approximately 76 million words collected from diverse sources covering multiple domains and geographical regions. The second corpus, containing approximately one million words, is a sub-corpus extracted from the first. It was annotated with lemma tags using a semi-automatic approach that combines automatic annotation with the Alkhalil lemmatizer and MADAMIRA, followed by manual validation."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="belayachi-mazroui-2026-alkhalil">
<titleInfo>
<title>Alkhalil Corpus: An Open-Source Thematic and Lemmatized Corpus for Modern Standard Arabic</title>
</titleInfo>
<name type="personal">
<namePart type="given">Samir</namePart>
<namePart type="family">Belayachi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Azzeddine</namePart>
<namePart type="family">Mazroui</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2nd Workshop on NLP for Languages Using Arabic Script</title>
</titleInfo>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Rabat, Morocco</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>The availability of large annotated corpora remains a major challenge for the development of natural language processing systems for under-resourced languages such as Arabic. In this paper, we present two annotated corpora dedicated to Modern Standard Arabic. These corpora are open-source and freely available on the Hugging Face platform. The first corpus, annotated by theme and designed to provide a balanced representation of contemporary Arabic usage, comprises approximately 76 million words collected from diverse sources covering multiple domains and geographical regions. The second corpus, containing approximately one million words, is a sub-corpus extracted from the first. It was annotated with lemma tags using a semi-automatic approach that combines automatic annotation with the Alkhalil lemmatizer and MADAMIRA, followed by manual validation.</abstract>
<identifier type="citekey">belayachi-mazroui-2026-alkhalil</identifier>
<location>
<url>https://aclanthology.org/2026.abjadnlp-1.27/</url>
</location>
<part>
<date>2026-03</date>
<extent unit="page">
<start>192</start>
<end>197</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Alkhalil Corpus: An Open-Source Thematic and Lemmatized Corpus for Modern Standard Arabic
%A Belayachi, Samir
%A Mazroui, Azzeddine
%S Proceedings of the 2nd Workshop on NLP for Languages Using Arabic Script
%D 2026
%8 March
%I Association for Computational Linguistics
%C Rabat, Morocco
%F belayachi-mazroui-2026-alkhalil
%X The availability of large annotated corpora remains a major challenge for the development of natural language processing systems for under-resourced languages such as Arabic. In this paper, we present two annotated corpora dedicated to Modern Standard Arabic. These corpora are open-source and freely available on the Hugging Face platform. The first corpus, annotated by theme and designed to provide a balanced representation of contemporary Arabic usage, comprises approximately 76 million words collected from diverse sources covering multiple domains and geographical regions. The second corpus, containing approximately one million words, is a sub-corpus extracted from the first. It was annotated with lemma tags using a semi-automatic approach that combines automatic annotation with the Alkhalil lemmatizer and MADAMIRA, followed by manual validation.
%U https://aclanthology.org/2026.abjadnlp-1.27/
%P 192-197
Markdown (Informal)
[Alkhalil Corpus: An Open-Source Thematic and Lemmatized Corpus for Modern Standard Arabic](https://aclanthology.org/2026.abjadnlp-1.27/) (Belayachi & Mazroui, AbjadNLP 2026)
ACL