@inproceedings{el-ghawi-2025-web,
title = "Web-Based Corpus Compilation of the Emirati {A}rabic Dialect",
author = "El-Ghawi, Yousra A.",
editor = "Ezzini, Saad and
Alami, Hamza and
Berrada, Ismail and
Benlahbib, Abdessamad and
El Mahdaouy, Abdelkader and
Lamsiyah, Salima and
Derrouz, Hatim and
Haddad Haddad, Amal and
Jarrar, Mustafa and
El-Haj, Mo and
Mitkov, Ruslan and
Rayson, Paul",
booktitle = "Proceedings of the 4th Workshop on Arabic Corpus Linguistics (WACL-4)",
month = jan,
year = "2025",
address = "Abu Dhabi, UAE",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.wacl-1.7/",
pages = "63--67",
abstract = "This paper displays some initial efforts conducted in the compilation pursuits of Arabic dialectal corpora in the form of raw text, the end purpose of which is to fine-tune existing Arabic large language models (LLM) to better understand and generate text in the Emirati dialect as instructed. The focus of the paper is on the process of compiling corpora from the web, which includes the exploration of possible methods, tools and techniques specific to web search, as well as examples of genres and domains to explore. The results of these efforts and the importance of native speaker contributions to corpus compilation for low-resource languages are also touched upon."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="el-ghawi-2025-web">
<titleInfo>
<title>Web-Based Corpus Compilation of the Emirati Arabic Dialect</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yousra</namePart>
<namePart type="given">A</namePart>
<namePart type="family">El-Ghawi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-01</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 4th Workshop on Arabic Corpus Linguistics (WACL-4)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Saad</namePart>
<namePart type="family">Ezzini</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hamza</namePart>
<namePart type="family">Alami</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ismail</namePart>
<namePart type="family">Berrada</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Abdessamad</namePart>
<namePart type="family">Benlahbib</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Abdelkader</namePart>
<namePart type="family">El Mahdaouy</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Salima</namePart>
<namePart type="family">Lamsiyah</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hatim</namePart>
<namePart type="family">Derrouz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Amal</namePart>
<namePart type="family">Haddad Haddad</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mustafa</namePart>
<namePart type="family">Jarrar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mo</namePart>
<namePart type="family">El-Haj</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ruslan</namePart>
<namePart type="family">Mitkov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Paul</namePart>
<namePart type="family">Rayson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, UAE</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>This paper displays some initial efforts conducted in the compilation pursuits of Arabic dialectal corpora in the form of raw text, the end purpose of which is to fine-tune existing Arabic large language models (LLM) to better understand and generate text in the Emirati dialect as instructed. The focus of the paper is on the process of compiling corpora from the web, which includes the exploration of possible methods, tools and techniques specific to web search, as well as examples of genres and domains to explore. The results of these efforts and the importance of native speaker contributions to corpus compilation for low-resource languages are also touched upon.</abstract>
<identifier type="citekey">el-ghawi-2025-web</identifier>
<location>
<url>https://aclanthology.org/2025.wacl-1.7/</url>
</location>
<part>
<date>2025-01</date>
<extent unit="page">
<start>63</start>
<end>67</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Web-Based Corpus Compilation of the Emirati Arabic Dialect
%A El-Ghawi, Yousra A.
%Y Ezzini, Saad
%Y Alami, Hamza
%Y Berrada, Ismail
%Y Benlahbib, Abdessamad
%Y El Mahdaouy, Abdelkader
%Y Lamsiyah, Salima
%Y Derrouz, Hatim
%Y Haddad Haddad, Amal
%Y Jarrar, Mustafa
%Y El-Haj, Mo
%Y Mitkov, Ruslan
%Y Rayson, Paul
%S Proceedings of the 4th Workshop on Arabic Corpus Linguistics (WACL-4)
%D 2025
%8 January
%I Association for Computational Linguistics
%C Abu Dhabi, UAE
%F el-ghawi-2025-web
%X This paper displays some initial efforts conducted in the compilation pursuits of Arabic dialectal corpora in the form of raw text, the end purpose of which is to fine-tune existing Arabic large language models (LLM) to better understand and generate text in the Emirati dialect as instructed. The focus of the paper is on the process of compiling corpora from the web, which includes the exploration of possible methods, tools and techniques specific to web search, as well as examples of genres and domains to explore. The results of these efforts and the importance of native speaker contributions to corpus compilation for low-resource languages are also touched upon.
%U https://aclanthology.org/2025.wacl-1.7/
%P 63-67
Markdown (Informal)
[Web-Based Corpus Compilation of the Emirati Arabic Dialect](https://aclanthology.org/2025.wacl-1.7/) (El-Ghawi, WACL 2025)
ACL