@inproceedings{alfattni-2026-corpus,
title = "A Corpus-Based Investigation of Contemporary {A}rabic Dialects Using the {SADA} Corpus",
author = "Alfattni, Ghada",
booktitle = "Proceedings of the 2nd Workshop on {NLP} for Languages Using {A}rabic Script",
month = mar,
year = "2026",
address = "Rabat, Morocco",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.abjadnlp-1.35/",
pages = "276--286",
abstract = "The spoken Arabic exhibits substantial dialectal variation in the Arabic-speaking world. This paper presents a corpus-based analysis of Arabic dialectal variation using the SADA corpus, examining lexical, morphosyntactic, and discourse-pragmatic patterns across dialects. We combine quantitative frequency-based measures with qualitative linguistic analysis, including keyword comparison, distributional profiling, collocational and trigram analyses, and similarity-based clustering. Our results show that Arabic dialects share a substantial common core, while differing systematically in frequent discourse markers, evaluative expressions, and recurrent phraseological patterns. These findings provide empirical evidence for regional clustering among contemporary dialects and for variation relative to the standard register. The study contributes linguistic insights that support both Arabic dialectology and the development of dialect-aware NLP systems."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="alfattni-2026-corpus">
<titleInfo>
<title>A Corpus-Based Investigation of Contemporary Arabic Dialects Using the SADA Corpus</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ghada</namePart>
<namePart type="family">Alfattni</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2nd Workshop on NLP for Languages Using Arabic Script</title>
</titleInfo>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Rabat, Morocco</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>The spoken Arabic exhibits substantial dialectal variation in the Arabic-speaking world. This paper presents a corpus-based analysis of Arabic dialectal variation using the SADA corpus, examining lexical, morphosyntactic, and discourse-pragmatic patterns across dialects. We combine quantitative frequency-based measures with qualitative linguistic analysis, including keyword comparison, distributional profiling, collocational and trigram analyses, and similarity-based clustering. Our results show that Arabic dialects share a substantial common core, while differing systematically in frequent discourse markers, evaluative expressions, and recurrent phraseological patterns. These findings provide empirical evidence for regional clustering among contemporary dialects and for variation relative to the standard register. The study contributes linguistic insights that support both Arabic dialectology and the development of dialect-aware NLP systems.</abstract>
<identifier type="citekey">alfattni-2026-corpus</identifier>
<location>
<url>https://aclanthology.org/2026.abjadnlp-1.35/</url>
</location>
<part>
<date>2026-03</date>
<extent unit="page">
<start>276</start>
<end>286</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T A Corpus-Based Investigation of Contemporary Arabic Dialects Using the SADA Corpus
%A Alfattni, Ghada
%S Proceedings of the 2nd Workshop on NLP for Languages Using Arabic Script
%D 2026
%8 March
%I Association for Computational Linguistics
%C Rabat, Morocco
%F alfattni-2026-corpus
%X The spoken Arabic exhibits substantial dialectal variation in the Arabic-speaking world. This paper presents a corpus-based analysis of Arabic dialectal variation using the SADA corpus, examining lexical, morphosyntactic, and discourse-pragmatic patterns across dialects. We combine quantitative frequency-based measures with qualitative linguistic analysis, including keyword comparison, distributional profiling, collocational and trigram analyses, and similarity-based clustering. Our results show that Arabic dialects share a substantial common core, while differing systematically in frequent discourse markers, evaluative expressions, and recurrent phraseological patterns. These findings provide empirical evidence for regional clustering among contemporary dialects and for variation relative to the standard register. The study contributes linguistic insights that support both Arabic dialectology and the development of dialect-aware NLP systems.
%U https://aclanthology.org/2026.abjadnlp-1.35/
%P 276-286
Markdown (Informal)
[A Corpus-Based Investigation of Contemporary Arabic Dialects Using the SADA Corpus](https://aclanthology.org/2026.abjadnlp-1.35/) (Alfattni, AbjadNLP 2026)
ACL