@inproceedings{zeinalipour-etal-2025-shawarma,
title = "Shawarma Chats: A Benchmark Exact Dialogue {\&} Evaluation Platter in {E}gyptian, {M}aghrebi {\&} Modern Standard {A}rabic{---}{A} Triple-Dialect Feast for Hungry Language Models",
author = "Zeinalipour, Kamyar and
Saad, Mohamed Zaky and
Attafi, Oumaima and
Maggini, Marco and
Gori, Marco",
editor = "Darwish, Kareem and
Ali, Ahmed and
Abu Farha, Ibrahim and
Touileb, Samia and
Zitouni, Imed and
Abdelali, Ahmed and
Al-Ghamdi, Sharefah and
Alkhereyf, Sakhar and
Zaghouani, Wajdi and
Khalifa, Salam and
AlKhamissi, Badr and
Almatham, Rawan and
Hamed, Injy and
Alyafeai, Zaid and
Alowisheq, Areeb and
Inoue, Go and
Mrini, Khalil and
Alshammari, Waad",
booktitle = "Proceedings of The Third Arabic Natural Language Processing Conference",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.arabicnlp-main.39/",
pages = "472--524",
ISBN = "979-8-89176-352-4",
abstract = "Content-grounded dialogue evaluation for Arabic remains under-resourced, particularly across Modern Standard (MSA), Egyptian, and Maghrebi varieties. We introduce Shawarma Chats, a benchmark of 30,000 six-turn conversations grounded in Wikipedia content, evenly split across the three dialects. To build this corpus, we prompt five frontier LLMs GPT-4o, Gemini 2.5 Flash, Qwen-Plus, DeepSeek-Chat, and Mistral Large to generate 1,500 seed dialogues. Native Arabic speakers evaluate these outputs to select the most effective generator and most human-aligned grader. Sub-A dialogues undergo a two-pass, rationale-driven self-repair loop where the grader critiques and the generator revises; unresolved cases are manually corrected. We apply this pipeline to 10,000 Wikipedia paragraphs to create 30,000 high-quality conversations 10,000 per dialect{---}at modest human cost. To validate the benchmark, we LoRA-fine-tune six open LLMs (1{--}24 B parameters) on Shawarma Chats and observe consistent gains in automatic-grader scores, BERTScore, BLEU and ROUGE particularly for models larger than 7 B parameters. Shawarma Chats thus establishes the first large-scale, dialect-aware, content-grounded dialogue benchmark for Arabic."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="zeinalipour-etal-2025-shawarma">
<titleInfo>
<title>Shawarma Chats: A Benchmark Exact Dialogue & Evaluation Platter in Egyptian, Maghrebi & Modern Standard Arabic—A Triple-Dialect Feast for Hungry Language Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kamyar</namePart>
<namePart type="family">Zeinalipour</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohamed</namePart>
<namePart type="given">Zaky</namePart>
<namePart type="family">Saad</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Oumaima</namePart>
<namePart type="family">Attafi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marco</namePart>
<namePart type="family">Maggini</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marco</namePart>
<namePart type="family">Gori</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of The Third Arabic Natural Language Processing Conference</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kareem</namePart>
<namePart type="family">Darwish</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ahmed</namePart>
<namePart type="family">Ali</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ibrahim</namePart>
<namePart type="family">Abu Farha</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Samia</namePart>
<namePart type="family">Touileb</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Imed</namePart>
<namePart type="family">Zitouni</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ahmed</namePart>
<namePart type="family">Abdelali</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sharefah</namePart>
<namePart type="family">Al-Ghamdi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sakhar</namePart>
<namePart type="family">Alkhereyf</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wajdi</namePart>
<namePart type="family">Zaghouani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Salam</namePart>
<namePart type="family">Khalifa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Badr</namePart>
<namePart type="family">AlKhamissi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rawan</namePart>
<namePart type="family">Almatham</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Injy</namePart>
<namePart type="family">Hamed</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zaid</namePart>
<namePart type="family">Alyafeai</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Areeb</namePart>
<namePart type="family">Alowisheq</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Go</namePart>
<namePart type="family">Inoue</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Khalil</namePart>
<namePart type="family">Mrini</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Waad</namePart>
<namePart type="family">Alshammari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Suzhou, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-352-4</identifier>
</relatedItem>
<abstract>Content-grounded dialogue evaluation for Arabic remains under-resourced, particularly across Modern Standard (MSA), Egyptian, and Maghrebi varieties. We introduce Shawarma Chats, a benchmark of 30,000 six-turn conversations grounded in Wikipedia content, evenly split across the three dialects. To build this corpus, we prompt five frontier LLMs GPT-4o, Gemini 2.5 Flash, Qwen-Plus, DeepSeek-Chat, and Mistral Large to generate 1,500 seed dialogues. Native Arabic speakers evaluate these outputs to select the most effective generator and most human-aligned grader. Sub-A dialogues undergo a two-pass, rationale-driven self-repair loop where the grader critiques and the generator revises; unresolved cases are manually corrected. We apply this pipeline to 10,000 Wikipedia paragraphs to create 30,000 high-quality conversations 10,000 per dialect—at modest human cost. To validate the benchmark, we LoRA-fine-tune six open LLMs (1–24 B parameters) on Shawarma Chats and observe consistent gains in automatic-grader scores, BERTScore, BLEU and ROUGE particularly for models larger than 7 B parameters. Shawarma Chats thus establishes the first large-scale, dialect-aware, content-grounded dialogue benchmark for Arabic.</abstract>
<identifier type="citekey">zeinalipour-etal-2025-shawarma</identifier>
<location>
<url>https://aclanthology.org/2025.arabicnlp-main.39/</url>
</location>
<part>
<date>2025-11</date>
<extent unit="page">
<start>472</start>
<end>524</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Shawarma Chats: A Benchmark Exact Dialogue & Evaluation Platter in Egyptian, Maghrebi & Modern Standard Arabic—A Triple-Dialect Feast for Hungry Language Models
%A Zeinalipour, Kamyar
%A Saad, Mohamed Zaky
%A Attafi, Oumaima
%A Maggini, Marco
%A Gori, Marco
%Y Darwish, Kareem
%Y Ali, Ahmed
%Y Abu Farha, Ibrahim
%Y Touileb, Samia
%Y Zitouni, Imed
%Y Abdelali, Ahmed
%Y Al-Ghamdi, Sharefah
%Y Alkhereyf, Sakhar
%Y Zaghouani, Wajdi
%Y Khalifa, Salam
%Y AlKhamissi, Badr
%Y Almatham, Rawan
%Y Hamed, Injy
%Y Alyafeai, Zaid
%Y Alowisheq, Areeb
%Y Inoue, Go
%Y Mrini, Khalil
%Y Alshammari, Waad
%S Proceedings of The Third Arabic Natural Language Processing Conference
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-352-4
%F zeinalipour-etal-2025-shawarma
%X Content-grounded dialogue evaluation for Arabic remains under-resourced, particularly across Modern Standard (MSA), Egyptian, and Maghrebi varieties. We introduce Shawarma Chats, a benchmark of 30,000 six-turn conversations grounded in Wikipedia content, evenly split across the three dialects. To build this corpus, we prompt five frontier LLMs GPT-4o, Gemini 2.5 Flash, Qwen-Plus, DeepSeek-Chat, and Mistral Large to generate 1,500 seed dialogues. Native Arabic speakers evaluate these outputs to select the most effective generator and most human-aligned grader. Sub-A dialogues undergo a two-pass, rationale-driven self-repair loop where the grader critiques and the generator revises; unresolved cases are manually corrected. We apply this pipeline to 10,000 Wikipedia paragraphs to create 30,000 high-quality conversations 10,000 per dialect—at modest human cost. To validate the benchmark, we LoRA-fine-tune six open LLMs (1–24 B parameters) on Shawarma Chats and observe consistent gains in automatic-grader scores, BERTScore, BLEU and ROUGE particularly for models larger than 7 B parameters. Shawarma Chats thus establishes the first large-scale, dialect-aware, content-grounded dialogue benchmark for Arabic.
%U https://aclanthology.org/2025.arabicnlp-main.39/
%P 472-524
Markdown (Informal)
[Shawarma Chats: A Benchmark Exact Dialogue & Evaluation Platter in Egyptian, Maghrebi & Modern Standard Arabic—A Triple-Dialect Feast for Hungry Language Models](https://aclanthology.org/2025.arabicnlp-main.39/) (Zeinalipour et al., ArabicNLP 2025)
ACL