@inproceedings{belinkov-etal-2016-shamela,
title = "{S}hamela: A Large-Scale Historical {A}rabic Corpus",
author = "Belinkov, Yonatan and
Magidow, Alexander and
Romanov, Maxim and
Shmidman, Avi and
Koppel, Moshe",
editor = "Hinrichs, Erhard and
Hinrichs, Marie and
Trippel, Thorsten",
booktitle = "Proceedings of the Workshop on Language Technology Resources and Tools for Digital Humanities ({LT}4{DH})",
month = dec,
year = "2016",
address = "Osaka, Japan",
publisher = "The COLING 2016 Organizing Committee",
url = "https://aclanthology.org/W16-4007",
pages = "45--53",
abstract = "Arabic is a widely-spoken language with a rich and long history spanning more than fourteen centuries. Yet existing Arabic corpora largely focus on the modern period or lack sufficient diachronic information. We develop a large-scale, historical corpus of Arabic of about 1 billion words from diverse periods of time. We clean this corpus, process it with a morphological analyzer, and enhance it by detecting parallel passages and automatically dating undated texts. We demonstrate its utility with selected case-studies in which we show its application to the digital humanities.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="belinkov-etal-2016-shamela">
<titleInfo>
<title>Shamela: A Large-Scale Historical Arabic Corpus</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yonatan</namePart>
<namePart type="family">Belinkov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alexander</namePart>
<namePart type="family">Magidow</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Maxim</namePart>
<namePart type="family">Romanov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Avi</namePart>
<namePart type="family">Shmidman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Moshe</namePart>
<namePart type="family">Koppel</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2016-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Workshop on Language Technology Resources and Tools for Digital Humanities (LT4DH)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Erhard</namePart>
<namePart type="family">Hinrichs</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marie</namePart>
<namePart type="family">Hinrichs</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Thorsten</namePart>
<namePart type="family">Trippel</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>The COLING 2016 Organizing Committee</publisher>
<place>
<placeTerm type="text">Osaka, Japan</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Arabic is a widely-spoken language with a rich and long history spanning more than fourteen centuries. Yet existing Arabic corpora largely focus on the modern period or lack sufficient diachronic information. We develop a large-scale, historical corpus of Arabic of about 1 billion words from diverse periods of time. We clean this corpus, process it with a morphological analyzer, and enhance it by detecting parallel passages and automatically dating undated texts. We demonstrate its utility with selected case-studies in which we show its application to the digital humanities.</abstract>
<identifier type="citekey">belinkov-etal-2016-shamela</identifier>
<location>
<url>https://aclanthology.org/W16-4007</url>
</location>
<part>
<date>2016-12</date>
<extent unit="page">
<start>45</start>
<end>53</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Shamela: A Large-Scale Historical Arabic Corpus
%A Belinkov, Yonatan
%A Magidow, Alexander
%A Romanov, Maxim
%A Shmidman, Avi
%A Koppel, Moshe
%Y Hinrichs, Erhard
%Y Hinrichs, Marie
%Y Trippel, Thorsten
%S Proceedings of the Workshop on Language Technology Resources and Tools for Digital Humanities (LT4DH)
%D 2016
%8 December
%I The COLING 2016 Organizing Committee
%C Osaka, Japan
%F belinkov-etal-2016-shamela
%X Arabic is a widely-spoken language with a rich and long history spanning more than fourteen centuries. Yet existing Arabic corpora largely focus on the modern period or lack sufficient diachronic information. We develop a large-scale, historical corpus of Arabic of about 1 billion words from diverse periods of time. We clean this corpus, process it with a morphological analyzer, and enhance it by detecting parallel passages and automatically dating undated texts. We demonstrate its utility with selected case-studies in which we show its application to the digital humanities.
%U https://aclanthology.org/W16-4007
%P 45-53
Markdown (Informal)
[Shamela: A Large-Scale Historical Arabic Corpus](https://aclanthology.org/W16-4007) (Belinkov et al., LT4DH 2016)
ACL
- Yonatan Belinkov, Alexander Magidow, Maxim Romanov, Avi Shmidman, and Moshe Koppel. 2016. Shamela: A Large-Scale Historical Arabic Corpus. In Proceedings of the Workshop on Language Technology Resources and Tools for Digital Humanities (LT4DH), pages 45–53, Osaka, Japan. The COLING 2016 Organizing Committee.