@inproceedings{indig-etal-2020-elte,
title = "The {ELTE}.{DH} Pilot Corpus {--} Creating a Handcrafted {G}igaword Web Corpus with Metadata",
author = {Indig, Bal{\'a}zs and
Knap, {\'A}rp{\'a}d and
S{\'a}rk{\"o}zi-Lindner, Zs{\'o}fia and
Tim{\'a}ri, M{\'a}ria and
Palk{\'o}, G{\'a}bor},
editor = {Barbaresi, Adrien and
Bildhauer, Felix and
Sch{\"a}fer, Roland and
Stemle, Egon},
booktitle = "Proceedings of the 12th Web as Corpus Workshop",
month = may,
year = "2020",
address = "Marseille, France",
publisher = "European Language Resources Association",
url = "https://aclanthology.org/2020.wac-1.5",
pages = "33--41",
abstract = "In this article, we present the method we used to create a middle-sized corpus using targeted web crawling. Our corpus contains news portal articles along with their metadata, that can be useful for diverse audiences, ranging from digital humanists to NLP users. The method presented in this paper applies rule-based components that allow the curation of the text and the metadata content. The curated data can thereon serve as a reference for various tasks and measurements. We designed our workflow to encourage modification and customisation. Our concept can also be applied to other genres of portals by using the discovered patterns in the architecture of the portals. We found that for a systematic creation or extension of a similar corpus, our method provides superior accuracy and ease of use compared to The Wayback Machine, while requiring minimal manpower and computational resources. Reproducing the corpus is possible if changes are introduced to the text-extraction process. The standard TEI format and Schema.org encoded metadata is used for the output format, but we stress that placing the corpus in a digital repository system is recommended in order to be able to define semantic relations between the segments and to add rich annotation.",
language = "English",
ISBN = "979-10-95546-68-9",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="indig-etal-2020-elte">
<titleInfo>
<title>The ELTE.DH Pilot Corpus – Creating a Handcrafted Gigaword Web Corpus with Metadata</title>
</titleInfo>
<name type="personal">
<namePart type="given">Balázs</namePart>
<namePart type="family">Indig</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Árpád</namePart>
<namePart type="family">Knap</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zsófia</namePart>
<namePart type="family">Sárközi-Lindner</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mária</namePart>
<namePart type="family">Timári</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gábor</namePart>
<namePart type="family">Palkó</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<language>
<languageTerm type="text">English</languageTerm>
<languageTerm type="code" authority="iso639-2b">eng</languageTerm>
</language>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 12th Web as Corpus Workshop</title>
</titleInfo>
<name type="personal">
<namePart type="given">Adrien</namePart>
<namePart type="family">Barbaresi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Felix</namePart>
<namePart type="family">Bildhauer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Roland</namePart>
<namePart type="family">Schäfer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Egon</namePart>
<namePart type="family">Stemle</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>European Language Resources Association</publisher>
<place>
<placeTerm type="text">Marseille, France</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-10-95546-68-9</identifier>
</relatedItem>
<abstract>In this article, we present the method we used to create a middle-sized corpus using targeted web crawling. Our corpus contains news portal articles along with their metadata, that can be useful for diverse audiences, ranging from digital humanists to NLP users. The method presented in this paper applies rule-based components that allow the curation of the text and the metadata content. The curated data can thereon serve as a reference for various tasks and measurements. We designed our workflow to encourage modification and customisation. Our concept can also be applied to other genres of portals by using the discovered patterns in the architecture of the portals. We found that for a systematic creation or extension of a similar corpus, our method provides superior accuracy and ease of use compared to The Wayback Machine, while requiring minimal manpower and computational resources. Reproducing the corpus is possible if changes are introduced to the text-extraction process. The standard TEI format and Schema.org encoded metadata is used for the output format, but we stress that placing the corpus in a digital repository system is recommended in order to be able to define semantic relations between the segments and to add rich annotation.</abstract>
<identifier type="citekey">indig-etal-2020-elte</identifier>
<location>
<url>https://aclanthology.org/2020.wac-1.5</url>
</location>
<part>
<date>2020-05</date>
<extent unit="page">
<start>33</start>
<end>41</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T The ELTE.DH Pilot Corpus – Creating a Handcrafted Gigaword Web Corpus with Metadata
%A Indig, Balázs
%A Knap, Árpád
%A Sárközi-Lindner, Zsófia
%A Timári, Mária
%A Palkó, Gábor
%Y Barbaresi, Adrien
%Y Bildhauer, Felix
%Y Schäfer, Roland
%Y Stemle, Egon
%S Proceedings of the 12th Web as Corpus Workshop
%D 2020
%8 May
%I European Language Resources Association
%C Marseille, France
%@ 979-10-95546-68-9
%G English
%F indig-etal-2020-elte
%X In this article, we present the method we used to create a middle-sized corpus using targeted web crawling. Our corpus contains news portal articles along with their metadata, that can be useful for diverse audiences, ranging from digital humanists to NLP users. The method presented in this paper applies rule-based components that allow the curation of the text and the metadata content. The curated data can thereon serve as a reference for various tasks and measurements. We designed our workflow to encourage modification and customisation. Our concept can also be applied to other genres of portals by using the discovered patterns in the architecture of the portals. We found that for a systematic creation or extension of a similar corpus, our method provides superior accuracy and ease of use compared to The Wayback Machine, while requiring minimal manpower and computational resources. Reproducing the corpus is possible if changes are introduced to the text-extraction process. The standard TEI format and Schema.org encoded metadata is used for the output format, but we stress that placing the corpus in a digital repository system is recommended in order to be able to define semantic relations between the segments and to add rich annotation.
%U https://aclanthology.org/2020.wac-1.5
%P 33-41
Markdown (Informal)
[The ELTE.DH Pilot Corpus – Creating a Handcrafted Gigaword Web Corpus with Metadata](https://aclanthology.org/2020.wac-1.5) (Indig et al., WAC 2020)
ACL