@inproceedings{zeroual-etal-2019-osian,
title = "{OSIAN}: Open Source International {A}rabic News Corpus - Preparation and Integration into the {CLARIN}-infrastructure",
author = "Zeroual, Imad and
Goldhahn, Dirk and
Eckart, Thomas and
Lakhouaja, Abdelhak",
editor = "El-Hajj, Wassim and
Belguith, Lamia Hadrich and
Bougares, Fethi and
Magdy, Walid and
Zitouni, Imed and
Tomeh, Nadi and
El-Haj, Mahmoud and
Zaghouani, Wajdi",
booktitle = "Proceedings of the Fourth Arabic Natural Language Processing Workshop",
month = aug,
year = "2019",
address = "Florence, Italy",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/W19-4619",
doi = "10.18653/v1/W19-4619",
pages = "175--182",
abstract = "The World Wide Web has become a fundamental resource for building large text corpora. Broadcasting platforms such as news websites are rich sources of data regarding diverse topics and form a valuable foundation for research. The Arabic language is extensively utilized on the Web. Still, Arabic is relatively an under-resourced language in terms of availability of freely annotated corpora. This paper presents the first version of the Open Source International Arabic News (OSIAN) corpus. The corpus data was collected from international Arabic news websites, all being freely available on the Web. The corpus consists of about 3.5 million articles comprising more than 37 million sentences and roughly 1 billion tokens. It is encoded in XML; each article is annotated with metadata information. Moreover, each word is annotated with lemma and part-of-speech. the described corpus is processed, archived and published into the CLARIN infrastructure. This publication includes descriptive metadata via OAI-PMH, direct access to the plain text material (available under Creative Commons Attribution-Non-Commercial 4.0 International License - CC BY-NC 4.0), and integration into the WebLicht annotation platform and CLARIN{'}s Federated Content Search FCS.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="zeroual-etal-2019-osian">
<titleInfo>
<title>OSIAN: Open Source International Arabic News Corpus - Preparation and Integration into the CLARIN-infrastructure</title>
</titleInfo>
<name type="personal">
<namePart type="given">Imad</namePart>
<namePart type="family">Zeroual</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dirk</namePart>
<namePart type="family">Goldhahn</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Thomas</namePart>
<namePart type="family">Eckart</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Abdelhak</namePart>
<namePart type="family">Lakhouaja</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2019-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Fourth Arabic Natural Language Processing Workshop</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wassim</namePart>
<namePart type="family">El-Hajj</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lamia</namePart>
<namePart type="given">Hadrich</namePart>
<namePart type="family">Belguith</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Fethi</namePart>
<namePart type="family">Bougares</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Walid</namePart>
<namePart type="family">Magdy</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Imed</namePart>
<namePart type="family">Zitouni</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nadi</namePart>
<namePart type="family">Tomeh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mahmoud</namePart>
<namePart type="family">El-Haj</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wajdi</namePart>
<namePart type="family">Zaghouani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Florence, Italy</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>The World Wide Web has become a fundamental resource for building large text corpora. Broadcasting platforms such as news websites are rich sources of data regarding diverse topics and form a valuable foundation for research. The Arabic language is extensively utilized on the Web. Still, Arabic is relatively an under-resourced language in terms of availability of freely annotated corpora. This paper presents the first version of the Open Source International Arabic News (OSIAN) corpus. The corpus data was collected from international Arabic news websites, all being freely available on the Web. The corpus consists of about 3.5 million articles comprising more than 37 million sentences and roughly 1 billion tokens. It is encoded in XML; each article is annotated with metadata information. Moreover, each word is annotated with lemma and part-of-speech. the described corpus is processed, archived and published into the CLARIN infrastructure. This publication includes descriptive metadata via OAI-PMH, direct access to the plain text material (available under Creative Commons Attribution-Non-Commercial 4.0 International License - CC BY-NC 4.0), and integration into the WebLicht annotation platform and CLARIN’s Federated Content Search FCS.</abstract>
<identifier type="citekey">zeroual-etal-2019-osian</identifier>
<identifier type="doi">10.18653/v1/W19-4619</identifier>
<location>
<url>https://aclanthology.org/W19-4619</url>
</location>
<part>
<date>2019-08</date>
<extent unit="page">
<start>175</start>
<end>182</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T OSIAN: Open Source International Arabic News Corpus - Preparation and Integration into the CLARIN-infrastructure
%A Zeroual, Imad
%A Goldhahn, Dirk
%A Eckart, Thomas
%A Lakhouaja, Abdelhak
%Y El-Hajj, Wassim
%Y Belguith, Lamia Hadrich
%Y Bougares, Fethi
%Y Magdy, Walid
%Y Zitouni, Imed
%Y Tomeh, Nadi
%Y El-Haj, Mahmoud
%Y Zaghouani, Wajdi
%S Proceedings of the Fourth Arabic Natural Language Processing Workshop
%D 2019
%8 August
%I Association for Computational Linguistics
%C Florence, Italy
%F zeroual-etal-2019-osian
%X The World Wide Web has become a fundamental resource for building large text corpora. Broadcasting platforms such as news websites are rich sources of data regarding diverse topics and form a valuable foundation for research. The Arabic language is extensively utilized on the Web. Still, Arabic is relatively an under-resourced language in terms of availability of freely annotated corpora. This paper presents the first version of the Open Source International Arabic News (OSIAN) corpus. The corpus data was collected from international Arabic news websites, all being freely available on the Web. The corpus consists of about 3.5 million articles comprising more than 37 million sentences and roughly 1 billion tokens. It is encoded in XML; each article is annotated with metadata information. Moreover, each word is annotated with lemma and part-of-speech. the described corpus is processed, archived and published into the CLARIN infrastructure. This publication includes descriptive metadata via OAI-PMH, direct access to the plain text material (available under Creative Commons Attribution-Non-Commercial 4.0 International License - CC BY-NC 4.0), and integration into the WebLicht annotation platform and CLARIN’s Federated Content Search FCS.
%R 10.18653/v1/W19-4619
%U https://aclanthology.org/W19-4619
%U https://doi.org/10.18653/v1/W19-4619
%P 175-182
Markdown (Informal)
[OSIAN: Open Source International Arabic News Corpus - Preparation and Integration into the CLARIN-infrastructure](https://aclanthology.org/W19-4619) (Zeroual et al., WANLP 2019)
ACL