% ACL Anthology BibTeX record: KIND dataset paper, EACL 2024 Student Research Workshop.
@inproceedings{yamani-etal-2024-kind,
  title     = {The {KIND} Dataset: A Social Collaboration Approach for Nuanced Dialect Data Collection},
  author    = {Yamani, Asma and
               Alziyady, Raghad and
               AlYami, Reem and
               Albelali, Salma and
               Albelali, Leina and
               Almulhim, Jawharah and
               Alsulami, Amjad and
               Alfarraj, Motaz and
               Al-Zaidy, Rabeah},
  editor    = {Falk, Neele and
               Papi, Sara and
               Zhang, Mike},
  booktitle = {Proceedings of the 18th Conference of the European Chapter of the Association for Computational Linguistics: Student Research Workshop},
  month     = mar,
  year      = {2024},
  address   = {St. Julian{'}s, Malta},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2024.eacl-srw.3},
  pages     = {32--43},
  abstract  = {Nuanced dialects are a linguistic variant that pose several challenges for NLP models and techniques. One of the main challenges is the limited amount of datasets to enable extensive research and experimentation. We propose an approach for efficiently collecting nuanced dialectal datasets that are not only of high quality, but are versatile enough to be multipurpose as well. To test our approach we collect the KIND corpus, which is a collection of fine-grained Arabic dialect data. The data is short texts, and unlike many nuanced dialectal datasets, it is curated manually through social collaboration efforts as opposed to being crawled from social media. The collaborative approach is incentivized through educational gamification and competitions for which the community itself benefits from the open source dataset. Our approach aims to achieve: (1) coverage of dialects from under-represented groups and fine-grained dialectal varieties, (2) provide aligned parallel corpora for translation between Modern Standard Arabic (MSA) and multiple dialects to enable translation and comparison studies, (3) promote innovative approaches for nuanced dialect data collection. We explain the steps for the competition as well as the resulting datasets and the competing data collection systems. The KIND dataset is shared with the research community.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 record for citekey yamani-etal-2024-kind. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="yamani-etal-2024-kind">
    <titleInfo>
      <title>The KIND Dataset: A Social Collaboration Approach for Nuanced Dialect Data Collection</title>
    </titleInfo>
    <!-- Paper authors: one personal name element per author. -->
    <name type="personal">
      <namePart type="given">Asma</namePart>
      <namePart type="family">Yamani</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Raghad</namePart>
      <namePart type="family">Alziyady</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Reem</namePart>
      <namePart type="family">AlYami</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Salma</namePart>
      <namePart type="family">Albelali</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Leina</namePart>
      <namePart type="family">Albelali</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Jawharah</namePart>
      <namePart type="family">Almulhim</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Amjad</namePart>
      <namePart type="family">Alsulami</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Motaz</namePart>
      <namePart type="family">Alfarraj</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Rabeah</namePart>
      <namePart type="family">Al-Zaidy</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2024-03</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, with its editors, publisher and place. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 18th Conference of the European Chapter of the Association for Computational Linguistics: Student Research Workshop</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Neele</namePart>
        <namePart type="family">Falk</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Sara</namePart>
        <namePart type="family">Papi</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Mike</namePart>
        <namePart type="family">Zhang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">St. Julian’s, Malta</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Nuanced dialects are a linguistic variant that pose several challenges for NLP models and techniques. One of the main challenges is the limited amount of datasets to enable extensive research and experimentation. We propose an approach for efficiently collecting nuanced dialectal datasets that are not only of high quality, but are versatile enough to be multipurpose as well. To test our approach we collect the KIND corpus, which is a collection of fine-grained Arabic dialect data. The data is short texts, and unlike many nuanced dialectal datasets, it is curated manually through social collaboration efforts as opposed to being crawled from social media. The collaborative approach is incentivized through educational gamification and competitions for which the community itself benefits from the open source dataset. Our approach aims to achieve: (1) coverage of dialects from under-represented groups and fine-grained dialectal varieties, (2) provide aligned parallel corpora for translation between Modern Standard Arabic (MSA) and multiple dialects to enable translation and comparison studies, (3) promote innovative approaches for nuanced dialect data collection. We explain the steps for the competition as well as the resulting datasets and the competing data collection systems. The KIND dataset is shared with the research community.</abstract>
    <identifier type="citekey">yamani-etal-2024-kind</identifier>
    <location>
      <url>https://aclanthology.org/2024.eacl-srw.3</url>
    </location>
    <!-- Position of the paper within the host volume: issue date and page extent. -->
    <part>
      <date>2024-03</date>
      <extent unit="page">
        <start>32</start>
        <end>43</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T The KIND Dataset: A Social Collaboration Approach for Nuanced Dialect Data Collection
%A Yamani, Asma
%A Alziyady, Raghad
%A AlYami, Reem
%A Albelali, Salma
%A Albelali, Leina
%A Almulhim, Jawharah
%A Alsulami, Amjad
%A Alfarraj, Motaz
%A Al-Zaidy, Rabeah
%Y Falk, Neele
%Y Papi, Sara
%Y Zhang, Mike
%S Proceedings of the 18th Conference of the European Chapter of the Association for Computational Linguistics: Student Research Workshop
%D 2024
%8 March
%I Association for Computational Linguistics
%C St. Julian’s, Malta
%F yamani-etal-2024-kind
%X Nuanced dialects are a linguistic variant that pose several challenges for NLP models and techniques. One of the main challenges is the limited amount of datasets to enable extensive research and experimentation. We propose an approach for efficiently collecting nuanced dialectal datasets that are not only of high quality, but are versatile enough to be multipurpose as well. To test our approach we collect the KIND corpus, which is a collection of fine-grained Arabic dialect data. The data is short texts, and unlike many nuanced dialectal datasets, it is curated manually through social collaboration efforts as opposed to being crawled from social media. The collaborative approach is incentivized through educational gamification and competitions for which the community itself benefits from the open source dataset. Our approach aims to achieve: (1) coverage of dialects from under-represented groups and fine-grained dialectal varieties, (2) provide aligned parallel corpora for translation between Modern Standard Arabic (MSA) and multiple dialects to enable translation and comparison studies, (3) promote innovative approaches for nuanced dialect data collection. We explain the steps for the competition as well as the resulting datasets and the competing data collection systems. The KIND dataset is shared with the research community.
%U https://aclanthology.org/2024.eacl-srw.3
%P 32-43
Markdown (Informal)
[The KIND Dataset: A Social Collaboration Approach for Nuanced Dialect Data Collection](https://aclanthology.org/2024.eacl-srw.3) (Yamani et al., EACL 2024)
ACL
- Asma Yamani, Raghad Alziyady, Reem AlYami, Salma Albelali, Leina Albelali, Jawharah Almulhim, Amjad Alsulami, Motaz Alfarraj, and Rabeah Al-Zaidy. 2024. The KIND Dataset: A Social Collaboration Approach for Nuanced Dialect Data Collection. In Proceedings of the 18th Conference of the European Chapter of the Association for Computational Linguistics: Student Research Workshop, pages 32–43, St. Julian’s, Malta. Association for Computational Linguistics.