@inproceedings{alrowili-vijay-shanker-2023-artrivia,
title = "{A}r{T}rivia: Harvesting {A}rabic {W}ikipedia to Build A New {A}rabic Question Answering Dataset",
author = "Alrowili, Sultan and
Vijay-Shanker, K",
editor = "Sawaf, Hassan and
El-Beltagy, Samhaa and
Zaghouani, Wajdi and
Magdy, Walid and
Abdelali, Ahmed and
Tomeh, Nadi and
Abu Farha, Ibrahim and
Habash, Nizar and
Khalifa, Salam and
Keleg, Amr and
Haddad, Hatem and
Zitouni, Imed and
Mrini, Khalil and
Almatham, Rawan",
booktitle = "Proceedings of ArabicNLP 2023",
month = dec,
year = "2023",
address = "Singapore (Hybrid)",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.arabicnlp-1.17",
doi = "10.18653/v1/2023.arabicnlp-1.17",
pages = "191--207",
abstract = "We present ArTrivia, a new Arabic question-answering dataset consisting of more than 10,000 question-answer pairs along with relevant passages, covering a wide range of 18 diverse topics in Arabic. We created our dataset using a newly proposed pipeline that leverages diverse structured data sources from Arabic Wikipedia. Moreover, we conducted a comprehensive statistical analysis of ArTrivia and assessed the performance of each component in our pipeline. Additionally, we compared the performance of ArTrivia against the existing TyDi QA dataset using various experimental setups. Our analysis highlights the significance of often overlooked aspects in dataset creation, such as answer normalization, in enhancing the quality of QA datasets. Our evaluation also shows that ArTrivia presents more challenging and out-of-distribution questions to TyDi, raising questions about the feasibility of using ArTrivia as a complementary dataset to TyDi.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="alrowili-vijay-shanker-2023-artrivia">
<titleInfo>
<title>ArTrivia: Harvesting Arabic Wikipedia to Build A New Arabic Question Answering Dataset</title>
</titleInfo>
<name type="personal">
<namePart type="given">Sultan</namePart>
<namePart type="family">Alrowili</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">K</namePart>
<namePart type="family">Vijay-Shanker</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of ArabicNLP 2023</title>
</titleInfo>
<name type="personal">
<namePart type="given">Hassan</namePart>
<namePart type="family">Sawaf</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Samhaa</namePart>
<namePart type="family">El-Beltagy</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wajdi</namePart>
<namePart type="family">Zaghouani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Walid</namePart>
<namePart type="family">Magdy</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ahmed</namePart>
<namePart type="family">Abdelali</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nadi</namePart>
<namePart type="family">Tomeh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ibrahim</namePart>
<namePart type="family">Abu Farha</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nizar</namePart>
<namePart type="family">Habash</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Salam</namePart>
<namePart type="family">Khalifa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Amr</namePart>
<namePart type="family">Keleg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hatem</namePart>
<namePart type="family">Haddad</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Imed</namePart>
<namePart type="family">Zitouni</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Khalil</namePart>
<namePart type="family">Mrini</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rawan</namePart>
<namePart type="family">Almatham</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Singapore (Hybrid)</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We present ArTrivia, a new Arabic question-answering dataset consisting of more than 10,000 question-answer pairs along with relevant passages, covering a wide range of 18 diverse topics in Arabic. We created our dataset using a newly proposed pipeline that leverages diverse structured data sources from Arabic Wikipedia. Moreover, we conducted a comprehensive statistical analysis of ArTrivia and assessed the performance of each component in our pipeline. Additionally, we compared the performance of ArTrivia against the existing TyDi QA dataset using various experimental setups. Our analysis highlights the significance of often overlooked aspects in dataset creation, such as answer normalization, in enhancing the quality of QA datasets. Our evaluation also shows that ArTrivia presents more challenging and out-of-distribution questions to TyDi, raising questions about the feasibility of using ArTrivia as a complementary dataset to TyDi.</abstract>
<identifier type="citekey">alrowili-vijay-shanker-2023-artrivia</identifier>
<identifier type="doi">10.18653/v1/2023.arabicnlp-1.17</identifier>
<location>
<url>https://aclanthology.org/2023.arabicnlp-1.17</url>
</location>
<part>
<date>2023-12</date>
<extent unit="page">
<start>191</start>
<end>207</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T ArTrivia: Harvesting Arabic Wikipedia to Build A New Arabic Question Answering Dataset
%A Alrowili, Sultan
%A Vijay-Shanker, K.
%Y Sawaf, Hassan
%Y El-Beltagy, Samhaa
%Y Zaghouani, Wajdi
%Y Magdy, Walid
%Y Abdelali, Ahmed
%Y Tomeh, Nadi
%Y Abu Farha, Ibrahim
%Y Habash, Nizar
%Y Khalifa, Salam
%Y Keleg, Amr
%Y Haddad, Hatem
%Y Zitouni, Imed
%Y Mrini, Khalil
%Y Almatham, Rawan
%S Proceedings of ArabicNLP 2023
%D 2023
%8 December
%I Association for Computational Linguistics
%C Singapore (Hybrid)
%F alrowili-vijay-shanker-2023-artrivia
%X We present ArTrivia, a new Arabic question-answering dataset consisting of more than 10,000 question-answer pairs along with relevant passages, covering a wide range of 18 diverse topics in Arabic. We created our dataset using a newly proposed pipeline that leverages diverse structured data sources from Arabic Wikipedia. Moreover, we conducted a comprehensive statistical analysis of ArTrivia and assessed the performance of each component in our pipeline. Additionally, we compared the performance of ArTrivia against the existing TyDi QA dataset using various experimental setups. Our analysis highlights the significance of often overlooked aspects in dataset creation, such as answer normalization, in enhancing the quality of QA datasets. Our evaluation also shows that ArTrivia presents more challenging and out-of-distribution questions to TyDi, raising questions about the feasibility of using ArTrivia as a complementary dataset to TyDi.
%R 10.18653/v1/2023.arabicnlp-1.17
%U https://aclanthology.org/2023.arabicnlp-1.17
%U https://doi.org/10.18653/v1/2023.arabicnlp-1.17
%P 191-207
Markdown (Informal)
[ArTrivia: Harvesting Arabic Wikipedia to Build A New Arabic Question Answering Dataset](https://aclanthology.org/2023.arabicnlp-1.17) (Alrowili & Vijay-Shanker, ArabicNLP-WS 2023)
ACL