@inproceedings{kilgarriff-etal-2010-corpus,
title = "A Corpus Factory for Many Languages",
author = "Kilgarriff, Adam and
Reddy, Siva and
Pomik{\'a}lek, Jan and
PVS, Avinesh",
editor = "Calzolari, Nicoletta and
Choukri, Khalid and
Maegaard, Bente and
Mariani, Joseph and
Odijk, Jan and
Piperidis, Stelios and
Rosner, Mike and
Tapias, Daniel",
booktitle = "Proceedings of the Seventh International Conference on Language Resources and Evaluation ({LREC}'10)",
month = may,
year = "2010",
address = "Valletta, Malta",
publisher = "European Language Resources Association (ELRA)",
url = "http://www.lrec-conf.org/proceedings/lrec2010/pdf/79_Paper.pdf",
abstract = "For many languages there are no large, general-language corpora available. Until the web, all but the institutions could do little but shake their heads in dismay as corpus-building was long, slow and expensive. But with the advent of the Web it can be highly automated and thereby fast and inexpensive. We have developed a corpus factory where we build large corpora. In this paper we describe the method we use, and how it has worked, and how various problems were solved, for eight languages: Dutch, Hindi, Indonesian, Norwegian, Swedish, Telugu, Thai and Vietnamese. We use the BootCaT method: we take a set of 'seed words' for the language from Wikipedia. Then, several hundred times over, we * randomly select three or four of the seed words * send as a query to Google or Yahoo or Bing, which returns a 'search hits' page * gather the pages that Google or Yahoo point to and save the text. This forms the corpus, which we then * 'clean' (to remove navigation bars, advertisements etc) * remove duplicates * tokenise and (if tools are available) lemmatise and part-of-speech tag * load into our corpus query tool, the Sketch Engine The corpora we have developed are available for use in the Sketch Engine corpus query tool.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="kilgarriff-etal-2010-corpus">
<titleInfo>
<title>A Corpus Factory for Many Languages</title>
</titleInfo>
<name type="personal">
<namePart type="given">Adam</namePart>
<namePart type="family">Kilgarriff</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Siva</namePart>
<namePart type="family">Reddy</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jan</namePart>
<namePart type="family">Pomikálek</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Avinesh</namePart>
<namePart type="family">PVS</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2010-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Seventh International Conference on Language Resources and Evaluation (LREC’10)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Khalid</namePart>
<namePart type="family">Choukri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bente</namePart>
<namePart type="family">Maegaard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joseph</namePart>
<namePart type="family">Mariani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jan</namePart>
<namePart type="family">Odijk</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stelios</namePart>
<namePart type="family">Piperidis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mike</namePart>
<namePart type="family">Rosner</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Daniel</namePart>
<namePart type="family">Tapias</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>European Language Resources Association (ELRA)</publisher>
<place>
<placeTerm type="text">Valletta, Malta</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>For many languages there are no large, general-language corpora available. Until the web, all but the institutions could do little but shake their heads in dismay as corpus-building was long, slow and expensive. But with the advent of the Web it can be highly automated and thereby fast and inexpensive. We have developed a corpus factory where we build large corpora. In this paper we describe the method we use, and how it has worked, and how various problems were solved, for eight languages: Dutch, Hindi, Indonesian, Norwegian, Swedish, Telugu, Thai and Vietnamese. We use the BootCaT method: we take a set of ’seed words’ for the language from Wikipedia. Then, several hundred times over, we * randomly select three or four of the seed words * send as a query to Google or Yahoo or Bing, which returns a ’search hits’ page * gather the pages that Google or Yahoo point to and save the text. This forms the corpus, which we then * ’clean’ (to remove navigation bars, advertisements etc) * remove duplicates * tokenise and (if tools are available) lemmatise and part-of-speech tag * load into our corpus query tool, the Sketch Engine The corpora we have developed are available for use in the Sketch Engine corpus query tool.</abstract>
<identifier type="citekey">kilgarriff-etal-2010-corpus</identifier>
<location>
<url>http://www.lrec-conf.org/proceedings/lrec2010/pdf/79_Paper.pdf</url>
</location>
<part>
<date>2010-05</date>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T A Corpus Factory for Many Languages
%A Kilgarriff, Adam
%A Reddy, Siva
%A Pomikálek, Jan
%A PVS, Avinesh
%Y Calzolari, Nicoletta
%Y Choukri, Khalid
%Y Maegaard, Bente
%Y Mariani, Joseph
%Y Odijk, Jan
%Y Piperidis, Stelios
%Y Rosner, Mike
%Y Tapias, Daniel
%S Proceedings of the Seventh International Conference on Language Resources and Evaluation (LREC’10)
%D 2010
%8 May
%I European Language Resources Association (ELRA)
%C Valletta, Malta
%F kilgarriff-etal-2010-corpus
%X For many languages there are no large, general-language corpora available. Until the web, all but the institutions could do little but shake their heads in dismay as corpus-building was long, slow and expensive. But with the advent of the Web it can be highly automated and thereby fast and inexpensive. We have developed a corpus factory where we build large corpora. In this paper we describe the method we use, and how it has worked, and how various problems were solved, for eight languages: Dutch, Hindi, Indonesian, Norwegian, Swedish, Telugu, Thai and Vietnamese. We use the BootCaT method: we take a set of ’seed words’ for the language from Wikipedia. Then, several hundred times over, we * randomly select three or four of the seed words * send as a query to Google or Yahoo or Bing, which returns a ’search hits’ page * gather the pages that Google or Yahoo point to and save the text. This forms the corpus, which we then * ’clean’ (to remove navigation bars, advertisements etc) * remove duplicates * tokenise and (if tools are available) lemmatise and part-of-speech tag * load into our corpus query tool, the Sketch Engine The corpora we have developed are available for use in the Sketch Engine corpus query tool.
%U http://www.lrec-conf.org/proceedings/lrec2010/pdf/79_Paper.pdf
Markdown (Informal)
[A Corpus Factory for Many Languages](http://www.lrec-conf.org/proceedings/lrec2010/pdf/79_Paper.pdf) (Kilgarriff et al., LREC 2010)
ACL
- Adam Kilgarriff, Siva Reddy, Jan Pomikálek, and Avinesh PVS. 2010. A Corpus Factory for Many Languages. In Proceedings of the Seventh International Conference on Language Resources and Evaluation (LREC'10), Valletta, Malta. European Language Resources Association (ELRA).