% BibTeX export of ACL Anthology record 2026.chum-1.3 (workshop paper, CHum 2026).
% Whole words are brace-protected so sentence-casing styles keep "Arabic" and "CHum" intact.
@inproceedings{zaghouani-2026-arabic,
  title     = {{Arabic} Humor as a Diagnostic Probe for Large Language Models},
  author    = {Zaghouani, Wajdi},
  editor    = {Amir, Ori and
               Hempelmann, Christian F. and
               Rayz, Julia and
               Dong, Tiansi and
               Miller, Tristan},
  booktitle = {Proceedings of the 2nd Workshop on Computational Humor ({CHum} 2026)},
  month     = jul,
  year      = {2026},
  address   = {Online},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.chum-1.3/},
  pages     = {39--50},
  isbn      = {979-8-89176-431-6},
  abstract  = {Arabic humor provides a challenging diagnostic test for large language models because interpreting jokes often requires pragmatic inference, sociolinguistic awareness, and culturally grounded knowledge that standard NLP benchmarks do not evaluate. Arabic is particularly suitable for probing these abilities given its diglossic structure and dialect diversity, where humor frequently arises from register contrast, dialect-specific vocabulary, and shared cultural references. We propose a three-layer taxonomy of Arabic humor mechanisms covering pragmatic, semantic, and sociolinguistic phenomena, illustrated through thirteen curated examples spanning Egyptian, Levantine, Gulf, Tunisian, and Iraqi Arabic. Building on this taxonomy, we introduce a diagnostic evaluation framework using contrastive minimal pairs, a multi-dimensional scoring rubric, and a cultural presupposition ontology. A small proof-of-concept probing study with GPT-4o, Gemini 2.0 Flash, and Claude Sonnet 4.5 reveals recurring failure patterns in sarcasm interpretation, register contrast reasoning, dialectal vocabulary coverage, and cultural grounding. We position this work as a diagnostic framework and pilot, not a mature benchmark, and outline a path toward larger annotated resources.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 rendering of the citation zaghouani-2026-arabic: one <mods> record whose
     host <relatedItem> describes the proceedings volume (editors, publisher, ISBN). -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="zaghouani-2026-arabic">
    <!-- The paper itself: title, sole author, issue date. -->
    <titleInfo>
      <title>Arabic Humor as a Diagnostic Probe for Large Language Models</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Wajdi</namePart>
      <namePart type="family">Zaghouani</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2026-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- The containing proceedings volume and its five editors. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 2nd Workshop on Computational Humor (CHum 2026)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Ori</namePart>
        <namePart type="family">Amir</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Christian</namePart>
        <namePart type="given">F</namePart>
        <namePart type="family">Hempelmann</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Julia</namePart>
        <namePart type="family">Rayz</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Tiansi</namePart>
        <namePart type="family">Dong</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Tristan</namePart>
        <namePart type="family">Miller</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Online</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-431-6</identifier>
    </relatedItem>
    <abstract>Arabic humor provides a challenging diagnostic test for large language models because interpreting jokes often requires pragmatic inference, sociolinguistic awareness, and culturally grounded knowledge that standard NLP benchmarks do not evaluate. Arabic is particularly suitable for probing these abilities given its diglossic structure and dialect diversity, where humor frequently arises from register contrast, dialect-specific vocabulary, and shared cultural references. We propose a three-layer taxonomy of Arabic humor mechanisms covering pragmatic, semantic, and sociolinguistic phenomena, illustrated through thirteen curated examples spanning Egyptian, Levantine, Gulf, Tunisian, and Iraqi Arabic. Building on this taxonomy, we introduce a diagnostic evaluation framework using contrastive minimal pairs, a multi-dimensional scoring rubric, and a cultural presupposition ontology. A small proof-of-concept probing study with GPT-4o, Gemini 2.0 Flash, and Claude Sonnet 4.5 reveals recurring failure patterns in sarcasm interpretation, register contrast reasoning, dialectal vocabulary coverage, and cultural grounding. We position this work as a diagnostic framework and pilot, not a mature benchmark, and outline a path toward larger annotated resources.</abstract>
    <identifier type="citekey">zaghouani-2026-arabic</identifier>
    <location>
      <url>https://aclanthology.org/2026.chum-1.3/</url>
    </location>
    <!-- Position of the paper within the host volume. -->
    <part>
      <date>2026-07</date>
      <extent unit="page">
        <start>39</start>
        <end>50</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Arabic Humor as a Diagnostic Probe for Large Language Models
%A Zaghouani, Wajdi
%Y Amir, Ori
%Y Hempelmann, Christian F.
%Y Rayz, Julia
%Y Dong, Tiansi
%Y Miller, Tristan
%S Proceedings of the 2nd Workshop on Computational Humor (CHum 2026)
%D 2026
%8 July
%I Association for Computational Linguistics
%C Online
%@ 979-8-89176-431-6
%F zaghouani-2026-arabic
%X Arabic humor provides a challenging diagnostic test for large language models because interpreting jokes often requires pragmatic inference, sociolinguistic awareness, and culturally grounded knowledge that standard NLP benchmarks do not evaluate. Arabic is particularly suitable for probing these abilities given its diglossic structure and dialect diversity, where humor frequently arises from register contrast, dialect-specific vocabulary, and shared cultural references. We propose a three-layer taxonomy of Arabic humor mechanisms covering pragmatic, semantic, and sociolinguistic phenomena, illustrated through thirteen curated examples spanning Egyptian, Levantine, Gulf, Tunisian, and Iraqi Arabic. Building on this taxonomy, we introduce a diagnostic evaluation framework using contrastive minimal pairs, a multi-dimensional scoring rubric, and a cultural presupposition ontology. A small proof-of-concept probing study with GPT-4o, Gemini 2.0 Flash, and Claude Sonnet 4.5 reveals recurring failure patterns in sarcasm interpretation, register contrast reasoning, dialectal vocabulary coverage, and cultural grounding. We position this work as a diagnostic framework and pilot, not a mature benchmark, and outline a path toward larger annotated resources.
%U https://aclanthology.org/2026.chum-1.3/
%P 39-50
Markdown (Informal)
[Arabic Humor as a Diagnostic Probe for Large Language Models](https://aclanthology.org/2026.chum-1.3/) (Zaghouani, CHum 2026)
ACL