@inproceedings{mohamed-eida-habash-2025-beyond,
title = "Beyond {C}airo: {S}a{'}idi {E}gyptian {A}rabic Literary Corpus Construction and Analysis",
author = "Mohamed Eida, Mai and
Habash, Nizar",
editor = {H{\"a}m{\"a}l{\"a}inen, Mika and
{\"O}hman, Emily and
Bizzoni, Yuri and
Miyagawa, So and
Alnajjar, Khalid},
booktitle = "Proceedings of the 5th International Conference on Natural Language Processing for Digital Humanities",
month = may,
year = "2025",
address = "Albuquerque, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.nlp4dh-1.26/",
doi = "10.18653/v1/2025.nlp4dh-1.26",
pages = "292--304",
ISBN = "979-8-89176-234-3",
abstract = "Egyptian Arabic (EA) NLP resources have mainly focused on Cairene Egyptian Arabic (CEA), leaving sub-dialects like Sa{'}idi Egyptian Arabic (SEA) underrepresented. This paper introduces the first SEA corpus {--} an open-source, 4-million-word literary dataset of a dialect spoken by {\textasciitilde}30 million Egyptians. To validate its representation, we analyze SEA-specific linguistic features from dialectal surveys, confirming a higher prevalence in our corpus compared to existing EA datasets. Our findings offer insights into SEA{'}s orthographic representation in morphology, phonology, and lexicon, incorporating CODA* guidelines for normalization."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="mohamed-eida-habash-2025-beyond">
<titleInfo>
<title>Beyond Cairo: Sa’idi Egyptian Arabic Literary Corpus Construction and Analysis</title>
</titleInfo>
<name type="personal">
<namePart type="given">Mai</namePart>
<namePart type="family">Mohamed Eida</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nizar</namePart>
<namePart type="family">Habash</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 5th International Conference on Natural Language Processing for Digital Humanities</title>
</titleInfo>
<name type="personal">
<namePart type="given">Mika</namePart>
<namePart type="family">Hämäläinen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Emily</namePart>
<namePart type="family">Öhman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yuri</namePart>
<namePart type="family">Bizzoni</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">So</namePart>
<namePart type="family">Miyagawa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Khalid</namePart>
<namePart type="family">Alnajjar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Albuquerque, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-234-3</identifier>
</relatedItem>
<abstract>Egyptian Arabic (EA) NLP resources have mainly focused on Cairene Egyptian Arabic (CEA), leaving sub-dialects like Sa’idi Egyptian Arabic (SEA) underrepresented. This paper introduces the first SEA corpus – an open-source, 4-million-word literary dataset of a dialect spoken by ~30 million Egyptians. To validate its representation, we analyze SEA-specific linguistic features from dialectal surveys, confirming a higher prevalence in our corpus compared to existing EA datasets. Our findings offer insights into SEA’s orthographic representation in morphology, phonology, and lexicon, incorporating CODA* guidelines for normalization.</abstract>
<identifier type="citekey">mohamed-eida-habash-2025-beyond</identifier>
<identifier type="doi">10.18653/v1/2025.nlp4dh-1.26</identifier>
<location>
<url>https://aclanthology.org/2025.nlp4dh-1.26/</url>
</location>
<part>
<date>2025-05</date>
<extent unit="page">
<start>292</start>
<end>304</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Beyond Cairo: Sa’idi Egyptian Arabic Literary Corpus Construction and Analysis
%A Mohamed Eida, Mai
%A Habash, Nizar
%Y Hämäläinen, Mika
%Y Öhman, Emily
%Y Bizzoni, Yuri
%Y Miyagawa, So
%Y Alnajjar, Khalid
%S Proceedings of the 5th International Conference on Natural Language Processing for Digital Humanities
%D 2025
%8 May
%I Association for Computational Linguistics
%C Albuquerque, USA
%@ 979-8-89176-234-3
%F mohamed-eida-habash-2025-beyond
%X Egyptian Arabic (EA) NLP resources have mainly focused on Cairene Egyptian Arabic (CEA), leaving sub-dialects like Sa’idi Egyptian Arabic (SEA) underrepresented. This paper introduces the first SEA corpus – an open-source, 4-million-word literary dataset of a dialect spoken by ~30 million Egyptians. To validate its representation, we analyze SEA-specific linguistic features from dialectal surveys, confirming a higher prevalence in our corpus compared to existing EA datasets. Our findings offer insights into SEA’s orthographic representation in morphology, phonology, and lexicon, incorporating CODA* guidelines for normalization.
%R 10.18653/v1/2025.nlp4dh-1.26
%U https://aclanthology.org/2025.nlp4dh-1.26/
%U https://doi.org/10.18653/v1/2025.nlp4dh-1.26
%P 292-304
Markdown (Informal)
[Beyond Cairo: Sa’idi Egyptian Arabic Literary Corpus Construction and Analysis](https://aclanthology.org/2025.nlp4dh-1.26/) (Mohamed Eida & Habash, NLP4DH 2025)
ACL