% Workshop paper record (ACL Anthology 2025.nakbanlp-1.5); all names are in "Last, First" form.
@inproceedings{abuhaija-etal-2025-nakba,
    title = "The Nakba Lexicon: Building a Comprehensive Dataset from Palestinian Literature",
    author = "AbuHaija, Izza and
      Al Mandhari, Salim and
      El-Haj, Mo and
      Sibony, Jonas and
      Rayson, Paul",
    editor = "Jarrar, Mustafa and
      Habash, Nizar and
      El-Haj, Mo",
    booktitle = "Proceedings of the first International Workshop on Nakba Narratives as Language Resources",
    month = jan,
    year = "2025",
    address = "Abu Dhabi",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2025.nakbanlp-1.5/",
    pages = "37--47",
    abstract = "This paper introduces the Nakba Lexicon, a comprehensive dataset derived from the poetry collection \textit{Asifa {\textquoteleft}Ala al-Iz{\textquoteleft}aj} (Sorry for the Disturbance) by Istiqlal Eid, a Palestinian poet from El-Birweh. Eid's work poignantly reflects on themes of Palestinian identity, displacement, and resilience, serving as a resource for preserving linguistic and cultural heritage in the context of post-Nakba literature. The dataset is structured into ten thematic domains, including political terminology, memory and preservation, sensory and emotional lexicon, toponyms, nature, and external linguistic influences such as Hebrew, French, and English, thereby capturing the socio-political, emotional, and cultural dimensions of the Nakba. The Nakba Lexicon uniquely emphasises the contributions of women to Palestinian literary traditions, shedding light on often-overlooked narratives of resilience and cultural continuity. Advanced Natural Language Processing (NLP) techniques were employed to analyse the dataset, with fine-tuned pre-trained models such as ARABERT and MARBERT achieving F1-scores of 0.87 and 0.68 in language and lexical classification tasks, respectively, significantly outperforming traditional machine learning models. These results highlight the potential of domain-specific computational models to effectively analyse complex datasets, facilitating the preservation of marginalised voices. By bridging computational methods with cultural preservation, this study enhances the understanding of Palestinian linguistic heritage and contributes to broader efforts in documenting and analysing endangered narratives. The Nakba Lexicon paves the way for future interdisciplinary research, showcasing the role of NLP in addressing historical trauma, resilience, and cultural identity."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="abuhaija-etal-2025-nakba">
<titleInfo>
<title>The Nakba Lexicon: Building a Comprehensive Dataset from Palestinian Literature</title>
</titleInfo>
<name type="personal">
<namePart type="given">Izza</namePart>
<namePart type="family">AbuHaija</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Salim</namePart>
<namePart type="family">Al Mandhari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mo</namePart>
<namePart type="family">El-Haj</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jonas</namePart>
<namePart type="family">Sibony</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Paul</namePart>
<namePart type="family">Rayson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-01</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the first International Workshop on Nakba Narratives as Language Resources</title>
</titleInfo>
<name type="personal">
<namePart type="given">Mustafa</namePart>
<namePart type="family">Jarrar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nizar</namePart>
<namePart type="family">Habash</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mo</namePart>
<namePart type="family">El-Haj</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>This paper introduces the Nakba Lexicon, a comprehensive dataset derived from the poetry collection Asifa ‘Ala al-Iz‘aj (Sorry for the Disturbance) by Istiqlal Eid, a Palestinian poet from El-Birweh. Eid’s work poignantly reflects on themes of Palestinian identity, displacement, and resilience, serving as a resource for preserving linguistic and cultural heritage in the context of post-Nakba literature. The dataset is structured into ten thematic domains, including political terminology, memory and preservation, sensory and emotional lexicon, toponyms, nature, and external linguistic influences such as Hebrew, French, and English, thereby capturing the socio-political, emotional, and cultural dimensions of the Nakba. The Nakba Lexicon uniquely emphasises the contributions of women to Palestinian literary traditions, shedding light on often-overlooked narratives of resilience and cultural continuity. Advanced Natural Language Processing (NLP) techniques were employed to analyse the dataset, with fine-tuned pre-trained models such as ARABERT and MARBERT achieving F1-scores of 0.87 and 0.68 in language and lexical classification tasks, respectively, significantly outperforming traditional machine learning models. These results highlight the potential of domain-specific computational models to effectively analyse complex datasets, facilitating the preservation of marginalised voices. By bridging computational methods with cultural preservation, this study enhances the understanding of Palestinian linguistic heritage and contributes to broader efforts in documenting and analysing endangered narratives. The Nakba Lexicon paves the way for future interdisciplinary research, showcasing the role of NLP in addressing historical trauma, resilience, and cultural identity.</abstract>
<identifier type="citekey">abuhaija-etal-2025-nakba</identifier>
<location>
<url>https://aclanthology.org/2025.nakbanlp-1.5/</url>
</location>
<part>
<date>2025-01</date>
<extent unit="page">
<start>37</start>
<end>47</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T The Nakba Lexicon: Building a Comprehensive Dataset from Palestinian Literature
%A AbuHaija, Izza
%A Al Mandhari, Salim
%A El-Haj, Mo
%A Sibony, Jonas
%A Rayson, Paul
%Y Jarrar, Mustafa
%Y Habash, Nizar
%Y El-Haj, Mo
%S Proceedings of the first International Workshop on Nakba Narratives as Language Resources
%D 2025
%8 January
%I Association for Computational Linguistics
%C Abu Dhabi
%F abuhaija-etal-2025-nakba
%X This paper introduces the Nakba Lexicon, a comprehensive dataset derived from the poetry collection Asifa ‘Ala al-Iz‘aj (Sorry for the Disturbance) by Istiqlal Eid, a Palestinian poet from El-Birweh. Eid’s work poignantly reflects on themes of Palestinian identity, displacement, and resilience, serving as a resource for preserving linguistic and cultural heritage in the context of post-Nakba literature. The dataset is structured into ten thematic domains, including political terminology, memory and preservation, sensory and emotional lexicon, toponyms, nature, and external linguistic influences such as Hebrew, French, and English, thereby capturing the socio-political, emotional, and cultural dimensions of the Nakba. The Nakba Lexicon uniquely emphasises the contributions of women to Palestinian literary traditions, shedding light on often-overlooked narratives of resilience and cultural continuity. Advanced Natural Language Processing (NLP) techniques were employed to analyse the dataset, with fine-tuned pre-trained models such as ARABERT and MARBERT achieving F1-scores of 0.87 and 0.68 in language and lexical classification tasks, respectively, significantly outperforming traditional machine learning models. These results highlight the potential of domain-specific computational models to effectively analyse complex datasets, facilitating the preservation of marginalised voices. By bridging computational methods with cultural preservation, this study enhances the understanding of Palestinian linguistic heritage and contributes to broader efforts in documenting and analysing endangered narratives. The Nakba Lexicon paves the way for future interdisciplinary research, showcasing the role of NLP in addressing historical trauma, resilience, and cultural identity.
%U https://aclanthology.org/2025.nakbanlp-1.5/
%P 37-47
Markdown (Informal)
[The Nakba Lexicon: Building a Comprehensive Dataset from Palestinian Literature](https://aclanthology.org/2025.nakbanlp-1.5/) (AbuHaija et al., NakbaNLP 2025)
ACL