@inproceedings{charfi-etal-2019-fine,
title = "A Fine-Grained Annotated Multi-Dialectal {A}rabic Corpus",
author = "Charfi, Anis and
Zaghouani, Wajdi and
Mehdi, Syed Hassan and
Mohamed, Esraa",
editor = "Mitkov, Ruslan and
Angelova, Galia",
booktitle = "Proceedings of the International Conference on Recent Advances in Natural Language Processing (RANLP 2019)",
month = sep,
year = "2019",
address = "Varna, Bulgaria",
publisher = "INCOMA Ltd.",
url = "https://aclanthology.org/R19-1023",
doi = "10.26615/978-954-452-056-4_023",
pages = "198--204",
abstract = "We present ARAP-Tweet 2.0, a corpus of 5 million dialectal Arabic tweets and 50 million words of about 3000 Twitter users from 17 Arab countries. Compared to the first version, the new corpus has significant improvements in terms of the data volume and the annotation quality. It is fully balanced with respect to dialect, gender, and three age groups: under 25 years, between 25 and 34, and 35 years and above. This paper describes the process of creating the corpus starting from gathering the dialectal phrases to find the users, to annotating their accounts and retrieving their tweets. We also report on the evaluation of the annotation quality using the inter-annotator agreement measures which were applied to the whole corpus and not just a subset. The obtained results were substantial with average Cohen{'}s Kappa values of 0.99, 0.92, and 0.88 for the annotation of gender, dialect, and age respectively. We also discuss some challenges encountered when developing this corpus.s.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="charfi-etal-2019-fine">
<titleInfo>
<title>A Fine-Grained Annotated Multi-Dialectal Arabic Corpus</title>
</titleInfo>
<name type="personal">
<namePart type="given">Anis</namePart>
<namePart type="family">Charfi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wajdi</namePart>
<namePart type="family">Zaghouani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Syed</namePart>
<namePart type="given">Hassan</namePart>
<namePart type="family">Mehdi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Esraa</namePart>
<namePart type="family">Mohamed</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2019-09</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the International Conference on Recent Advances in Natural Language Processing (RANLP 2019)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ruslan</namePart>
<namePart type="family">Mitkov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Galia</namePart>
<namePart type="family">Angelova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>INCOMA Ltd.</publisher>
<place>
<placeTerm type="text">Varna, Bulgaria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We present ARAP-Tweet 2.0, a corpus of 5 million dialectal Arabic tweets and 50 million words of about 3000 Twitter users from 17 Arab countries. Compared to the first version, the new corpus has significant improvements in terms of the data volume and the annotation quality. It is fully balanced with respect to dialect, gender, and three age groups: under 25 years, between 25 and 34, and 35 years and above. This paper describes the process of creating the corpus starting from gathering the dialectal phrases to find the users, to annotating their accounts and retrieving their tweets. We also report on the evaluation of the annotation quality using the inter-annotator agreement measures which were applied to the whole corpus and not just a subset. The obtained results were substantial with average Cohen’s Kappa values of 0.99, 0.92, and 0.88 for the annotation of gender, dialect, and age respectively. We also discuss some challenges encountered when developing this corpus.s.</abstract>
<identifier type="citekey">charfi-etal-2019-fine</identifier>
<identifier type="doi">10.26615/978-954-452-056-4_023</identifier>
<location>
<url>https://aclanthology.org/R19-1023</url>
</location>
<part>
<date>2019-09</date>
<extent unit="page">
<start>198</start>
<end>204</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T A Fine-Grained Annotated Multi-Dialectal Arabic Corpus
%A Charfi, Anis
%A Zaghouani, Wajdi
%A Mehdi, Syed Hassan
%A Mohamed, Esraa
%Y Mitkov, Ruslan
%Y Angelova, Galia
%S Proceedings of the International Conference on Recent Advances in Natural Language Processing (RANLP 2019)
%D 2019
%8 September
%I INCOMA Ltd.
%C Varna, Bulgaria
%F charfi-etal-2019-fine
%X We present ARAP-Tweet 2.0, a corpus of 5 million dialectal Arabic tweets and 50 million words of about 3000 Twitter users from 17 Arab countries. Compared to the first version, the new corpus has significant improvements in terms of the data volume and the annotation quality. It is fully balanced with respect to dialect, gender, and three age groups: under 25 years, between 25 and 34, and 35 years and above. This paper describes the process of creating the corpus starting from gathering the dialectal phrases to find the users, to annotating their accounts and retrieving their tweets. We also report on the evaluation of the annotation quality using the inter-annotator agreement measures which were applied to the whole corpus and not just a subset. The obtained results were substantial with average Cohen’s Kappa values of 0.99, 0.92, and 0.88 for the annotation of gender, dialect, and age respectively. We also discuss some challenges encountered when developing this corpus.s.
%R 10.26615/978-954-452-056-4_023
%U https://aclanthology.org/R19-1023
%U https://doi.org/10.26615/978-954-452-056-4_023
%P 198-204
Markdown (Informal)
[A Fine-Grained Annotated Multi-Dialectal Arabic Corpus](https://aclanthology.org/R19-1023) (Charfi et al., RANLP 2019)
ACL
- Anis Charfi, Wajdi Zaghouani, Syed Hassan Mehdi, and Esraa Mohamed. 2019. A Fine-Grained Annotated Multi-Dialectal Arabic Corpus. In Proceedings of the International Conference on Recent Advances in Natural Language Processing (RANLP 2019), pages 198–204, Varna, Bulgaria. INCOMA Ltd..