@inproceedings{roziewski-stokowiec-2016-languagecrawl,
title = "{L}anguage{C}rawl: A Generic Tool for Building Language Models Upon {C}ommon-{C}rawl",
author = "Roziewski, Szymon and
Stokowiec, Wojciech",
editor = "Calzolari, Nicoletta and
Choukri, Khalid and
Declerck, Thierry and
Goggi, Sara and
Grobelnik, Marko and
Maegaard, Bente and
Mariani, Joseph and
Mazo, Helene and
Moreno, Asuncion and
Odijk, Jan and
Piperidis, Stelios",
booktitle = "Proceedings of the Tenth International Conference on Language Resources and Evaluation ({LREC}'16)",
month = may,
year = "2016",
address = "Portoro{\v{z}}, Slovenia",
publisher = "European Language Resources Association (ELRA)",
url = "https://aclanthology.org/L16-1443",
pages = "2789--2793",
abstract = "The web data contains immense amount of data, hundreds of billion words are waiting to be extracted and used for language research. In this work we introduce our tool LanguageCrawl which allows NLP researchers to easily construct web-scale corpus from Common Crawl Archive: a petabyte scale, open repository of web crawl information. Three use-cases are presented: filtering Polish websites, building an N-gram corpora and training continuous skip-gram language model with hierarchical softmax. Each of them has been implemented within the LanguageCrawl toolkit, with the possibility to adjust specified language and N-gram ranks. Special effort has been put on high computing efficiency, by applying highly concurrent multitasking. We make our tool publicly available to enrich NLP resources. We strongly believe that our work will help to facilitate NLP research, especially in under-resourced languages, where the lack of appropriately sized corpora is a serious hindrance to applying data-intensive methods, such as deep neural networks.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="roziewski-stokowiec-2016-languagecrawl">
<titleInfo>
<title>LanguageCrawl: A Generic Tool for Building Language Models Upon Common-Crawl</title>
</titleInfo>
<name type="personal">
<namePart type="given">Szymon</namePart>
<namePart type="family">Roziewski</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wojciech</namePart>
<namePart type="family">Stokowiec</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2016-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC’16)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Khalid</namePart>
<namePart type="family">Choukri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Thierry</namePart>
<namePart type="family">Declerck</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sara</namePart>
<namePart type="family">Goggi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marko</namePart>
<namePart type="family">Grobelnik</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bente</namePart>
<namePart type="family">Maegaard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joseph</namePart>
<namePart type="family">Mariani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Helene</namePart>
<namePart type="family">Mazo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Asuncion</namePart>
<namePart type="family">Moreno</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jan</namePart>
<namePart type="family">Odijk</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stelios</namePart>
<namePart type="family">Piperidis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>European Language Resources Association (ELRA)</publisher>
<place>
<placeTerm type="text">Portorož, Slovenia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>The web data contains immense amount of data, hundreds of billion words are waiting to be extracted and used for language research. In this work we introduce our tool LanguageCrawl which allows NLP researchers to easily construct web-scale corpus from Common Crawl Archive: a petabyte scale, open repository of web crawl information. Three use-cases are presented: filtering Polish websites, building an N-gram corpora and training continuous skip-gram language model with hierarchical softmax. Each of them has been implemented within the LanguageCrawl toolkit, with the possibility to adjust specified language and N-gram ranks. Special effort has been put on high computing efficiency, by applying highly concurrent multitasking. We make our tool publicly available to enrich NLP resources. We strongly believe that our work will help to facilitate NLP research, especially in under-resourced languages, where the lack of appropriately sized corpora is a serious hindrance to applying data-intensive methods, such as deep neural networks.</abstract>
<identifier type="citekey">roziewski-stokowiec-2016-languagecrawl</identifier>
<location>
<url>https://aclanthology.org/L16-1443</url>
</location>
<part>
<date>2016-05</date>
<extent unit="page">
<start>2789</start>
<end>2793</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T LanguageCrawl: A Generic Tool for Building Language Models Upon Common-Crawl
%A Roziewski, Szymon
%A Stokowiec, Wojciech
%Y Calzolari, Nicoletta
%Y Choukri, Khalid
%Y Declerck, Thierry
%Y Goggi, Sara
%Y Grobelnik, Marko
%Y Maegaard, Bente
%Y Mariani, Joseph
%Y Mazo, Helene
%Y Moreno, Asuncion
%Y Odijk, Jan
%Y Piperidis, Stelios
%S Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC’16)
%D 2016
%8 May
%I European Language Resources Association (ELRA)
%C Portorož, Slovenia
%F roziewski-stokowiec-2016-languagecrawl
%X The web data contains immense amount of data, hundreds of billion words are waiting to be extracted and used for language research. In this work we introduce our tool LanguageCrawl which allows NLP researchers to easily construct web-scale corpus from Common Crawl Archive: a petabyte scale, open repository of web crawl information. Three use-cases are presented: filtering Polish websites, building an N-gram corpora and training continuous skip-gram language model with hierarchical softmax. Each of them has been implemented within the LanguageCrawl toolkit, with the possibility to adjust specified language and N-gram ranks. Special effort has been put on high computing efficiency, by applying highly concurrent multitasking. We make our tool publicly available to enrich NLP resources. We strongly believe that our work will help to facilitate NLP research, especially in under-resourced languages, where the lack of appropriately sized corpora is a serious hindrance to applying data-intensive methods, such as deep neural networks.
%U https://aclanthology.org/L16-1443
%P 2789-2793
Markdown (Informal)
[LanguageCrawl: A Generic Tool for Building Language Models Upon Common-Crawl](https://aclanthology.org/L16-1443) (Roziewski & Stokowiec, LREC 2016)
ACL