@inproceedings{ahmed-etal-2022-tafsir,
title = "Tafsir Dataset: A Novel Multi-Task Benchmark for Named Entity Recognition and Topic Modeling in Classical {A}rabic Literature",
author = {Ahmed, Sajawel and
van der Goot, Rob and
Rehman, Misbahur and
Kruse, Carl and
{\"O}zsoy, {\"O}mer and
Mehler, Alexander and
Roig, Gemma},
editor = "Calzolari, Nicoletta and
Huang, Chu-Ren and
Kim, Hansaem and
Pustejovsky, James and
Wanner, Leo and
Choi, Key-Sun and
Ryu, Pum-Mo and
Chen, Hsin-Hsi and
Donatelli, Lucia and
Ji, Heng and
Kurohashi, Sadao and
Paggio, Patrizia and
Xue, Nianwen and
Kim, Seokhwan and
Hahm, Younggyun and
He, Zhong and
Lee, Tony Kyungil and
Santus, Enrico and
Bond, Francis and
Na, Seung-Hoon",
booktitle = "Proceedings of the 29th International Conference on Computational Linguistics",
month = oct,
year = "2022",
address = "Gyeongju, Republic of Korea",
publisher = "International Committee on Computational Linguistics",
url = "https://aclanthology.org/2022.coling-1.330",
pages = "3753--3768",
abstract = "Various historical languages, which used to be lingua franca of science and arts, deserve the attention of current NLP research. In this work, we take the first data-driven steps towards this research line for Classical Arabic (CA) by addressing named entity recognition (NER) and topic modeling (TM) on the example of CA literature. We manually annotate the encyclopedic work of Tafsir Al-Tabari with span-based NEs, sentence-based topics, and span-based subtopics, thus creating the Tafsir Dataset with over 51,000 sentences, the first large-scale multi-task benchmark for CA. Next, we analyze our newly generated dataset, which we make open-source available, with current language models (lightweight BiLSTM, transformer-based MaChAmP) along a novel script compression method, thereby achieving state-of-the-art performance for our target task CA-NER. We also show that CA-TM from the perspective of historical topic models, which are central to Arabic studies, is very challenging. With this interdisciplinary work, we lay the foundations for future research on automatic analysis of CA literature.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="ahmed-etal-2022-tafsir">
<titleInfo>
<title>Tafsir Dataset: A Novel Multi-Task Benchmark for Named Entity Recognition and Topic Modeling in Classical Arabic Literature</title>
</titleInfo>
<name type="personal">
<namePart type="given">Sajawel</namePart>
<namePart type="family">Ahmed</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rob</namePart>
<namePart type="family">van der Goot</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Misbahur</namePart>
<namePart type="family">Rehman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Carl</namePart>
<namePart type="family">Kruse</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ömer</namePart>
<namePart type="family">Özsoy</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alexander</namePart>
<namePart type="family">Mehler</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gemma</namePart>
<namePart type="family">Roig</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-10</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 29th International Conference on Computational Linguistics</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chu-Ren</namePart>
<namePart type="family">Huang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hansaem</namePart>
<namePart type="family">Kim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">James</namePart>
<namePart type="family">Pustejovsky</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Leo</namePart>
<namePart type="family">Wanner</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Key-Sun</namePart>
<namePart type="family">Choi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pum-Mo</namePart>
<namePart type="family">Ryu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hsin-Hsi</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lucia</namePart>
<namePart type="family">Donatelli</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Heng</namePart>
<namePart type="family">Ji</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sadao</namePart>
<namePart type="family">Kurohashi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Patrizia</namePart>
<namePart type="family">Paggio</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nianwen</namePart>
<namePart type="family">Xue</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Seokhwan</namePart>
<namePart type="family">Kim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Younggyun</namePart>
<namePart type="family">Hahm</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhong</namePart>
<namePart type="family">He</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tony</namePart>
<namePart type="given">Kyungil</namePart>
<namePart type="family">Lee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Enrico</namePart>
<namePart type="family">Santus</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Francis</namePart>
<namePart type="family">Bond</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Seung-Hoon</namePart>
<namePart type="family">Na</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>International Committee on Computational Linguistics</publisher>
<place>
<placeTerm type="text">Gyeongju, Republic of Korea</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Various historical languages, which used to be lingua franca of science and arts, deserve the attention of current NLP research. In this work, we take the first data-driven steps towards this research line for Classical Arabic (CA) by addressing named entity recognition (NER) and topic modeling (TM) on the example of CA literature. We manually annotate the encyclopedic work of Tafsir Al-Tabari with span-based NEs, sentence-based topics, and span-based subtopics, thus creating the Tafsir Dataset with over 51,000 sentences, the first large-scale multi-task benchmark for CA. Next, we analyze our newly generated dataset, which we make open-source available, with current language models (lightweight BiLSTM, transformer-based MaChAmP) along a novel script compression method, thereby achieving state-of-the-art performance for our target task CA-NER. We also show that CA-TM from the perspective of historical topic models, which are central to Arabic studies, is very challenging. With this interdisciplinary work, we lay the foundations for future research on automatic analysis of CA literature.</abstract>
<identifier type="citekey">ahmed-etal-2022-tafsir</identifier>
<location>
<url>https://aclanthology.org/2022.coling-1.330</url>
</location>
<part>
<date>2022-10</date>
<extent unit="page">
<start>3753</start>
<end>3768</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Tafsir Dataset: A Novel Multi-Task Benchmark for Named Entity Recognition and Topic Modeling in Classical Arabic Literature
%A Ahmed, Sajawel
%A van der Goot, Rob
%A Rehman, Misbahur
%A Kruse, Carl
%A Özsoy, Ömer
%A Mehler, Alexander
%A Roig, Gemma
%Y Calzolari, Nicoletta
%Y Huang, Chu-Ren
%Y Kim, Hansaem
%Y Pustejovsky, James
%Y Wanner, Leo
%Y Choi, Key-Sun
%Y Ryu, Pum-Mo
%Y Chen, Hsin-Hsi
%Y Donatelli, Lucia
%Y Ji, Heng
%Y Kurohashi, Sadao
%Y Paggio, Patrizia
%Y Xue, Nianwen
%Y Kim, Seokhwan
%Y Hahm, Younggyun
%Y He, Zhong
%Y Lee, Tony Kyungil
%Y Santus, Enrico
%Y Bond, Francis
%Y Na, Seung-Hoon
%S Proceedings of the 29th International Conference on Computational Linguistics
%D 2022
%8 October
%I International Committee on Computational Linguistics
%C Gyeongju, Republic of Korea
%F ahmed-etal-2022-tafsir
%X Various historical languages, which used to be lingua franca of science and arts, deserve the attention of current NLP research. In this work, we take the first data-driven steps towards this research line for Classical Arabic (CA) by addressing named entity recognition (NER) and topic modeling (TM) on the example of CA literature. We manually annotate the encyclopedic work of Tafsir Al-Tabari with span-based NEs, sentence-based topics, and span-based subtopics, thus creating the Tafsir Dataset with over 51,000 sentences, the first large-scale multi-task benchmark for CA. Next, we analyze our newly generated dataset, which we make open-source available, with current language models (lightweight BiLSTM, transformer-based MaChAmP) along a novel script compression method, thereby achieving state-of-the-art performance for our target task CA-NER. We also show that CA-TM from the perspective of historical topic models, which are central to Arabic studies, is very challenging. With this interdisciplinary work, we lay the foundations for future research on automatic analysis of CA literature.
%U https://aclanthology.org/2022.coling-1.330
%P 3753-3768
Markdown (Informal)
[Tafsir Dataset: A Novel Multi-Task Benchmark for Named Entity Recognition and Topic Modeling in Classical Arabic Literature](https://aclanthology.org/2022.coling-1.330) (Ahmed et al., COLING 2022)
ACL