@inproceedings{noureldien-etal-2025-zero,
title = "Zero-Shot and Fine-Tuned Evaluation of Generative {LLM}s for {A}rabic Word Sense Disambiguation",
author = "Noureldien, Yossra and
Mohamed, Abdelrazig and
Attallah, Farah",
editor = "Darwish, Kareem and
Ali, Ahmed and
Abu Farha, Ibrahim and
Touileb, Samia and
Zitouni, Imed and
Abdelali, Ahmed and
Al-Ghamdi, Sharefah and
Alkhereyf, Sakhar and
Zaghouani, Wajdi and
Khalifa, Salam and
AlKhamissi, Badr and
Almatham, Rawan and
Hamed, Injy and
Alyafeai, Zaid and
Alowisheq, Areeb and
Inoue, Go and
Mrini, Khalil and
Alshammari, Waad",
booktitle = "Proceedings of The Third Arabic Natural Language Processing Conference",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.arabicnlp-main.24/",
pages = "298--305",
ISBN = "979-8-89176-352-4",
abstract = "Arabic presents unique challenges for sense level language understanding due to its rich morphology and semantic ambiguity. This paper benchmarks large generative language models (LLMs) for Arabic Word Sense Disambiguation (WSD) under both zero-shot and fine-tuning conditions. We evaluate one proprietary model (GPT-4o) and three opensource models (LLaMA 3.1-8B, Qwen 2.5-7B, and Gemma 2-9B) on two publicly available datasets. In zero-shot settings, GPT-4o achieved the highest overall performance, with comparable results across both datasets, reaching 79{\%} accuracy and an average macro-F1 score of 66.08{\%}. Fine-tuning, however, notably elevated all open models beyond GPT4o{'}s zero-shot results. Qwen achieved the top scores on one dataset, with an accuracy of 90.77{\%} and a macro-F1 score of 83.98{\%}, while LLaMA scored highest on the other, reaching an accuracy of 88.51{\%} and a macroF1 score of 69.41{\%}. These findings demonstrate that parameter-efficient supervised adaptation can close much of the performance gap and establish strong, reproducible baselines for Arabic WSD using open-source, relatively medium-sized models. Full code is publicly available."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="noureldien-etal-2025-zero">
<titleInfo>
<title>Zero-Shot and Fine-Tuned Evaluation of Generative LLMs for Arabic Word Sense Disambiguation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yossra</namePart>
<namePart type="family">Noureldien</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Abdelrazig</namePart>
<namePart type="family">Mohamed</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Farah</namePart>
<namePart type="family">Attallah</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of The Third Arabic Natural Language Processing Conference</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kareem</namePart>
<namePart type="family">Darwish</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ahmed</namePart>
<namePart type="family">Ali</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ibrahim</namePart>
<namePart type="family">Abu Farha</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Samia</namePart>
<namePart type="family">Touileb</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Imed</namePart>
<namePart type="family">Zitouni</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ahmed</namePart>
<namePart type="family">Abdelali</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sharefah</namePart>
<namePart type="family">Al-Ghamdi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sakhar</namePart>
<namePart type="family">Alkhereyf</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wajdi</namePart>
<namePart type="family">Zaghouani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Salam</namePart>
<namePart type="family">Khalifa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Badr</namePart>
<namePart type="family">AlKhamissi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rawan</namePart>
<namePart type="family">Almatham</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Injy</namePart>
<namePart type="family">Hamed</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zaid</namePart>
<namePart type="family">Alyafeai</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Areeb</namePart>
<namePart type="family">Alowisheq</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Go</namePart>
<namePart type="family">Inoue</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Khalil</namePart>
<namePart type="family">Mrini</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Waad</namePart>
<namePart type="family">Alshammari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Suzhou, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-352-4</identifier>
</relatedItem>
<abstract>Arabic presents unique challenges for sense level language understanding due to its rich morphology and semantic ambiguity. This paper benchmarks large generative language models (LLMs) for Arabic Word Sense Disambiguation (WSD) under both zero-shot and fine-tuning conditions. We evaluate one proprietary model (GPT-4o) and three opensource models (LLaMA 3.1-8B, Qwen 2.5-7B, and Gemma 2-9B) on two publicly available datasets. In zero-shot settings, GPT-4o achieved the highest overall performance, with comparable results across both datasets, reaching 79% accuracy and an average macro-F1 score of 66.08%. Fine-tuning, however, notably elevated all open models beyond GPT4o’s zero-shot results. Qwen achieved the top scores on one dataset, with an accuracy of 90.77% and a macro-F1 score of 83.98%, while LLaMA scored highest on the other, reaching an accuracy of 88.51% and a macroF1 score of 69.41%. These findings demonstrate that parameter-efficient supervised adaptation can close much of the performance gap and establish strong, reproducible baselines for Arabic WSD using open-source, relatively medium-sized models. Full code is publicly available.</abstract>
<identifier type="citekey">noureldien-etal-2025-zero</identifier>
<location>
<url>https://aclanthology.org/2025.arabicnlp-main.24/</url>
</location>
<part>
<date>2025-11</date>
<extent unit="page">
<start>298</start>
<end>305</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Zero-Shot and Fine-Tuned Evaluation of Generative LLMs for Arabic Word Sense Disambiguation
%A Noureldien, Yossra
%A Mohamed, Abdelrazig
%A Attallah, Farah
%Y Darwish, Kareem
%Y Ali, Ahmed
%Y Abu Farha, Ibrahim
%Y Touileb, Samia
%Y Zitouni, Imed
%Y Abdelali, Ahmed
%Y Al-Ghamdi, Sharefah
%Y Alkhereyf, Sakhar
%Y Zaghouani, Wajdi
%Y Khalifa, Salam
%Y AlKhamissi, Badr
%Y Almatham, Rawan
%Y Hamed, Injy
%Y Alyafeai, Zaid
%Y Alowisheq, Areeb
%Y Inoue, Go
%Y Mrini, Khalil
%Y Alshammari, Waad
%S Proceedings of The Third Arabic Natural Language Processing Conference
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-352-4
%F noureldien-etal-2025-zero
%X Arabic presents unique challenges for sense level language understanding due to its rich morphology and semantic ambiguity. This paper benchmarks large generative language models (LLMs) for Arabic Word Sense Disambiguation (WSD) under both zero-shot and fine-tuning conditions. We evaluate one proprietary model (GPT-4o) and three opensource models (LLaMA 3.1-8B, Qwen 2.5-7B, and Gemma 2-9B) on two publicly available datasets. In zero-shot settings, GPT-4o achieved the highest overall performance, with comparable results across both datasets, reaching 79% accuracy and an average macro-F1 score of 66.08%. Fine-tuning, however, notably elevated all open models beyond GPT4o’s zero-shot results. Qwen achieved the top scores on one dataset, with an accuracy of 90.77% and a macro-F1 score of 83.98%, while LLaMA scored highest on the other, reaching an accuracy of 88.51% and a macroF1 score of 69.41%. These findings demonstrate that parameter-efficient supervised adaptation can close much of the performance gap and establish strong, reproducible baselines for Arabic WSD using open-source, relatively medium-sized models. Full code is publicly available.
%U https://aclanthology.org/2025.arabicnlp-main.24/
%P 298-305
Markdown (Informal)
[Zero-Shot and Fine-Tuned Evaluation of Generative LLMs for Arabic Word Sense Disambiguation](https://aclanthology.org/2025.arabicnlp-main.24/) (Noureldien et al., ArabicNLP 2025)
ACL