@inproceedings{petasis-petasis-2010-blogbuster,
title = "{B}log{B}uster: A Tool for Extracting Corpora from the Blogosphere",
author = "Petasis, Georgios and
Petasis, Dimitrios",
editor = "Calzolari, Nicoletta and
Choukri, Khalid and
Maegaard, Bente and
Mariani, Joseph and
Odijk, Jan and
Piperidis, Stelios and
Rosner, Mike and
Tapias, Daniel",
booktitle = "Proceedings of the Seventh International Conference on Language Resources and Evaluation ({LREC}'10)",
month = may,
year = "2010",
address = "Valletta, Malta",
publisher = "European Language Resources Association (ELRA)",
url = "http://www.lrec-conf.org/proceedings/lrec2010/pdf/808_Paper.pdf",
abstract = "This paper presents BlogBuster, a tool for extracting a corpus from the blogosphere. The topic of cleaning arbitrary web pages with the goal of extracting a corpus from web data, suitable for linguistic and language technology research and development, has attracted significant research interest recently. Several general purpose approaches for removing boilerplate have been presented in the literature; however the blogosphere poses additional requirements, such as a finer control over the extracted textual segments in order to accurately identify important elements, i.e. individual blog posts, titles, posting dates or comments. BlogBuster tries to provide such additional details along with boilerplate removal, following a rule-based approach. A small set of rules were manually constructed by observing a limited set of blogs from the Blogger and Wordpress hosting platforms. These rules operate on the DOM tree of an HTML page, as constructed by a popular browser, Mozilla Firefox. Evaluation results suggest that BlogBuster is very accurate when extracting corpora from blogs hosted in the Blogger and Wordpress, while exhibiting a reasonable precision when applied to blogs not hosted in these two popular blogging platforms.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="petasis-petasis-2010-blogbuster">
<titleInfo>
<title>BlogBuster: A Tool for Extracting Corpora from the Blogosphere</title>
</titleInfo>
<name type="personal">
<namePart type="given">Georgios</namePart>
<namePart type="family">Petasis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dimitrios</namePart>
<namePart type="family">Petasis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2010-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Seventh International Conference on Language Resources and Evaluation (LREC’10)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Khalid</namePart>
<namePart type="family">Choukri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bente</namePart>
<namePart type="family">Maegaard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joseph</namePart>
<namePart type="family">Mariani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jan</namePart>
<namePart type="family">Odijk</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stelios</namePart>
<namePart type="family">Piperidis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mike</namePart>
<namePart type="family">Rosner</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Daniel</namePart>
<namePart type="family">Tapias</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>European Language Resources Association (ELRA)</publisher>
<place>
<placeTerm type="text">Valletta, Malta</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>This paper presents BlogBuster, a tool for extracting a corpus from the blogosphere. The topic of cleaning arbitrary web pages with the goal of extracting a corpus from web data, suitable for linguistic and language technology research and development, has attracted significant research interest recently. Several general purpose approaches for removing boilerplate have been presented in the literature; however the blogosphere poses additional requirements, such as a finer control over the extracted textual segments in order to accurately identify important elements, i.e. individual blog posts, titles, posting dates or comments. BlogBuster tries to provide such additional details along with boilerplate removal, following a rule-based approach. A small set of rules were manually constructed by observing a limited set of blogs from the Blogger and Wordpress hosting platforms. These rules operate on the DOM tree of an HTML page, as constructed by a popular browser, Mozilla Firefox. Evaluation results suggest that BlogBuster is very accurate when extracting corpora from blogs hosted in the Blogger and Wordpress, while exhibiting a reasonable precision when applied to blogs not hosted in these two popular blogging platforms.</abstract>
<identifier type="citekey">petasis-petasis-2010-blogbuster</identifier>
<location>
<url>http://www.lrec-conf.org/proceedings/lrec2010/pdf/808_Paper.pdf</url>
</location>
<part>
<date>2010-05</date>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T BlogBuster: A Tool for Extracting Corpora from the Blogosphere
%A Petasis, Georgios
%A Petasis, Dimitrios
%Y Calzolari, Nicoletta
%Y Choukri, Khalid
%Y Maegaard, Bente
%Y Mariani, Joseph
%Y Odijk, Jan
%Y Piperidis, Stelios
%Y Rosner, Mike
%Y Tapias, Daniel
%S Proceedings of the Seventh International Conference on Language Resources and Evaluation (LREC’10)
%D 2010
%8 May
%I European Language Resources Association (ELRA)
%C Valletta, Malta
%F petasis-petasis-2010-blogbuster
%X This paper presents BlogBuster, a tool for extracting a corpus from the blogosphere. The topic of cleaning arbitrary web pages with the goal of extracting a corpus from web data, suitable for linguistic and language technology research and development, has attracted significant research interest recently. Several general purpose approaches for removing boilerplate have been presented in the literature; however the blogosphere poses additional requirements, such as a finer control over the extracted textual segments in order to accurately identify important elements, i.e. individual blog posts, titles, posting dates or comments. BlogBuster tries to provide such additional details along with boilerplate removal, following a rule-based approach. A small set of rules were manually constructed by observing a limited set of blogs from the Blogger and Wordpress hosting platforms. These rules operate on the DOM tree of an HTML page, as constructed by a popular browser, Mozilla Firefox. Evaluation results suggest that BlogBuster is very accurate when extracting corpora from blogs hosted in the Blogger and Wordpress, while exhibiting a reasonable precision when applied to blogs not hosted in these two popular blogging platforms.
%U http://www.lrec-conf.org/proceedings/lrec2010/pdf/808_Paper.pdf
Markdown (Informal)
[BlogBuster: A Tool for Extracting Corpora from the Blogosphere](http://www.lrec-conf.org/proceedings/lrec2010/pdf/808_Paper.pdf) (Petasis & Petasis, LREC 2010)
ACL