@inproceedings{bondok-etal-2025-proper,
title = "Proper Noun Diacritization for {A}rabic {W}ikipedia: A Benchmark Dataset",
author = "Bondok, Rawan and
Nassar, Mayar and
Khalifa, Salam and
Micallef, Kurt and
Habash, Nizar",
editor = "Arora, Akhil and
Johnson, Isaac and
Kaffee, Lucie-Aim{\'e}e and
Kuo, Tzu-Sheng and
Piccardi, Tiziano and
Sen, Indira",
booktitle = "Proceedings of the 2nd Workshop on Advancing Natural Language Processing for Wikipedia (WikiNLP 2025)",
month = aug,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.wikinlp-1.8/",
doi = "10.18653/v1/2025.wikinlp-1.8",
pages = "31--44",
ISBN = "979-8-89176-284-8",
abstract = "Proper nouns in Arabic Wikipedia are frequently undiacritized, creating ambiguity in pronunciation and interpretation, especially for transliterated named entities of foreign origin. While transliteration and diacritization have been well-studied separately in Arabic NLP, their intersection remains underexplored. In this paper, we introduce a new manually diacritized dataset of Arabic proper nouns of various origins with their English Wikipedia equivalent glosses, and present the challenges and guidelines we followed to create it. We benchmark GPT-4o on the task of recovering full diacritization given the undiacritized Arabic and English forms, and analyze its performance. Achieving 73{\%} accuracy, our results underscore both the difficulty of the task and the need for improved models and resources. We release our dataset to facilitate further research on Arabic Wikipedia proper noun diacritization."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="bondok-etal-2025-proper">
<titleInfo>
<title>Proper Noun Diacritization for Arabic Wikipedia: A Benchmark Dataset</title>
</titleInfo>
<name type="personal">
<namePart type="given">Rawan</namePart>
<namePart type="family">Bondok</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mayar</namePart>
<namePart type="family">Nassar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Salam</namePart>
<namePart type="family">Khalifa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kurt</namePart>
<namePart type="family">Micallef</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nizar</namePart>
<namePart type="family">Habash</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2nd Workshop on Advancing Natural Language Processing for Wikipedia (WikiNLP 2025)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Akhil</namePart>
<namePart type="family">Arora</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Isaac</namePart>
<namePart type="family">Johnson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lucie-Aimée</namePart>
<namePart type="family">Kaffee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tzu-Sheng</namePart>
<namePart type="family">Kuo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tiziano</namePart>
<namePart type="family">Piccardi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Indira</namePart>
<namePart type="family">Sen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-284-8</identifier>
</relatedItem>
<abstract>Proper nouns in Arabic Wikipedia are frequently undiacritized, creating ambiguity in pronunciation and interpretation, especially for transliterated named entities of foreign origin. While transliteration and diacritization have been well-studied separately in Arabic NLP, their intersection remains underexplored. In this paper, we introduce a new manually diacritized dataset of Arabic proper nouns of various origins with their English Wikipedia equivalent glosses, and present the challenges and guidelines we followed to create it. We benchmark GPT-4o on the task of recovering full diacritization given the undiacritized Arabic and English forms, and analyze its performance. Achieving 73% accuracy, our results underscore both the difficulty of the task and the need for improved models and resources. We release our dataset to facilitate further research on Arabic Wikipedia proper noun diacritization.</abstract>
<identifier type="citekey">bondok-etal-2025-proper</identifier>
<identifier type="doi">10.18653/v1/2025.wikinlp-1.8</identifier>
<location>
<url>https://aclanthology.org/2025.wikinlp-1.8/</url>
</location>
<part>
<date>2025-08</date>
<extent unit="page">
<start>31</start>
<end>44</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Proper Noun Diacritization for Arabic Wikipedia: A Benchmark Dataset
%A Bondok, Rawan
%A Nassar, Mayar
%A Khalifa, Salam
%A Micallef, Kurt
%A Habash, Nizar
%Y Arora, Akhil
%Y Johnson, Isaac
%Y Kaffee, Lucie-Aimée
%Y Kuo, Tzu-Sheng
%Y Piccardi, Tiziano
%Y Sen, Indira
%S Proceedings of the 2nd Workshop on Advancing Natural Language Processing for Wikipedia (WikiNLP 2025)
%D 2025
%8 August
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-284-8
%F bondok-etal-2025-proper
%X Proper nouns in Arabic Wikipedia are frequently undiacritized, creating ambiguity in pronunciation and interpretation, especially for transliterated named entities of foreign origin. While transliteration and diacritization have been well-studied separately in Arabic NLP, their intersection remains underexplored. In this paper, we introduce a new manually diacritized dataset of Arabic proper nouns of various origins with their English Wikipedia equivalent glosses, and present the challenges and guidelines we followed to create it. We benchmark GPT-4o on the task of recovering full diacritization given the undiacritized Arabic and English forms, and analyze its performance. Achieving 73% accuracy, our results underscore both the difficulty of the task and the need for improved models and resources. We release our dataset to facilitate further research on Arabic Wikipedia proper noun diacritization.
%R 10.18653/v1/2025.wikinlp-1.8
%U https://aclanthology.org/2025.wikinlp-1.8/
%U https://doi.org/10.18653/v1/2025.wikinlp-1.8
%P 31-44
Markdown (Informal)
[Proper Noun Diacritization for Arabic Wikipedia: A Benchmark Dataset](https://aclanthology.org/2025.wikinlp-1.8/) (Bondok et al., WikiNLP 2025)
ACL