@inproceedings{al-haff-etal-2022-curras,
title = "Curras + Baladi: Towards a {L}evantine Corpus",
author = "Al-Haff, Karim and
Jarrar, Mustafa and
Hammouda, Tymaa and
Zaraket, Fadi",
booktitle = "Proceedings of the Thirteenth Language Resources and Evaluation Conference",
month = jun,
year = "2022",
address = "Marseille, France",
publisher = "European Language Resources Association",
url = "https://aclanthology.org/2022.lrec-1.82",
pages = "769--778",
abstract = "This paper presents two-fold contributions: a full revision of the Palestinian morphologically annotated corpus (Curras), and a newly annotated Lebanese corpus (Baladi). Both corpora can be used as a more general Levantine corpus. Baladi consists of around 9.6K morphologically annotated tokens. Each token was manually annotated with several morphological features and using LDC{'}s SAMA lemmas and tags. The inter-annotator evaluation on most features illustrates 78.5{\%} Kappa and 90.1{\%} F1-Score. Curras was revised by refining all annotations for accuracy, normalization and unification of POS tags, and linking with SAMA lemmas. This revision was also important to ensure that both corpora are compatible and can help to bridge the nuanced linguistic gaps that exist between the two highly mutually intelligible dialects. Both corpora are publicly available through a web portal.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="al-haff-etal-2022-curras">
<titleInfo>
<title>Curras + Baladi: Towards a Levantine Corpus</title>
</titleInfo>
<name type="personal">
<namePart type="given">Karim</namePart>
<namePart type="family">Al-Haff</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mustafa</namePart>
<namePart type="family">Jarrar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tymaa</namePart>
<namePart type="family">Hammouda</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Fadi</namePart>
<namePart type="family">Zaraket</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Thirteenth Language Resources and Evaluation Conference</title>
</titleInfo>
<originInfo>
<publisher>European Language Resources Association</publisher>
<place>
<placeTerm type="text">Marseille, France</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>This paper presents two-fold contributions: a full revision of the Palestinian morphologically annotated corpus (Curras), and a newly annotated Lebanese corpus (Baladi). Both corpora can be used as a more general Levantine corpus. Baladi consists of around 9.6K morphologically annotated tokens. Each token was manually annotated with several morphological features and using LDC’s SAMA lemmas and tags. The inter-annotator evaluation on most features illustrates 78.5% Kappa and 90.1% F1-Score. Curras was revised by refining all annotations for accuracy, normalization and unification of POS tags, and linking with SAMA lemmas. This revision was also important to ensure that both corpora are compatible and can help to bridge the nuanced linguistic gaps that exist between the two highly mutually intelligible dialects. Both corpora are publicly available through a web portal.</abstract>
<identifier type="citekey">al-haff-etal-2022-curras</identifier>
<location>
<url>https://aclanthology.org/2022.lrec-1.82</url>
</location>
<part>
<date>2022-06</date>
<extent unit="page">
<start>769</start>
<end>778</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Curras + Baladi: Towards a Levantine Corpus
%A Al-Haff, Karim
%A Jarrar, Mustafa
%A Hammouda, Tymaa
%A Zaraket, Fadi
%S Proceedings of the Thirteenth Language Resources and Evaluation Conference
%D 2022
%8 June
%I European Language Resources Association
%C Marseille, France
%F al-haff-etal-2022-curras
%X This paper presents two-fold contributions: a full revision of the Palestinian morphologically annotated corpus (Curras), and a newly annotated Lebanese corpus (Baladi). Both corpora can be used as a more general Levantine corpus. Baladi consists of around 9.6K morphologically annotated tokens. Each token was manually annotated with several morphological features and using LDC’s SAMA lemmas and tags. The inter-annotator evaluation on most features illustrates 78.5% Kappa and 90.1% F1-Score. Curras was revised by refining all annotations for accuracy, normalization and unification of POS tags, and linking with SAMA lemmas. This revision was also important to ensure that both corpora are compatible and can help to bridge the nuanced linguistic gaps that exist between the two highly mutually intelligible dialects. Both corpora are publicly available through a web portal.
%U https://aclanthology.org/2022.lrec-1.82
%P 769-778
Markdown (Informal)
[Curras + Baladi: Towards a Levantine Corpus](https://aclanthology.org/2022.lrec-1.82) (Al-Haff et al., LREC 2022)
ACL
- Karim Al-Haff, Mustafa Jarrar, Tymaa Hammouda, and Fadi Zaraket. 2022. Curras + Baladi: Towards a Levantine Corpus. In Proceedings of the Thirteenth Language Resources and Evaluation Conference, pages 769–778, Marseille, France. European Language Resources Association.