@inproceedings{ibrahim-etal-2008-automatic,
title = "Automatic Extraction of Textual Elements from News Web Pages",
author = "Ibrahim, Hossam and
Darwish, Kareem and
Madany, Abdel-Rahim",
editor = "Calzolari, Nicoletta and
Choukri, Khalid and
Maegaard, Bente and
Mariani, Joseph and
Odijk, Jan and
Piperidis, Stelios and
Tapias, Daniel",
booktitle = "Proceedings of the Sixth International Conference on Language Resources and Evaluation ({LREC}'08)",
month = may,
year = "2008",
address = "Marrakech, Morocco",
publisher = "European Language Resources Association (ELRA)",
url = "http://www.lrec-conf.org/proceedings/lrec2008/pdf/407_paper.pdf",
abstract = "In this paper we present an algorithm for automatic extraction of textual elements, namely titles and full text, associated with news stories in news web pages. We propose a supervised machine learning classification technique based on the use of a Support Vector Machine (SVM) classifier to extract the desired textual elements. The technique uses internal structural features of a webpage without relying on the Document Object Model to which many content authors fail to adhere. The classifier uses a set of features which rely on the length of text, the percentage of hypertext, etc. The resulting classifier is nearly perfect on previously unseen news pages from different sites. The proposed technique is successfully employed in Alzoa.com, which is the largest Arabic news aggregator on the web.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="ibrahim-etal-2008-automatic">
<titleInfo>
<title>Automatic Extraction of Textual Elements from News Web Pages</title>
</titleInfo>
<name type="personal">
<namePart type="given">Hossam</namePart>
<namePart type="family">Ibrahim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kareem</namePart>
<namePart type="family">Darwish</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Abdel-Rahim</namePart>
<namePart type="family">Madany</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2008-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Sixth International Conference on Language Resources and Evaluation (LREC’08)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Khalid</namePart>
<namePart type="family">Choukri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bente</namePart>
<namePart type="family">Maegaard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joseph</namePart>
<namePart type="family">Mariani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jan</namePart>
<namePart type="family">Odijk</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stelios</namePart>
<namePart type="family">Piperidis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Daniel</namePart>
<namePart type="family">Tapias</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>European Language Resources Association (ELRA)</publisher>
<place>
<placeTerm type="text">Marrakech, Morocco</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>In this paper we present an algorithm for automatic extraction of textual elements, namely titles and full text, associated with news stories in news web pages. We propose a supervised machine learning classification technique based on the use of a Support Vector Machine (SVM) classifier to extract the desired textual elements. The technique uses internal structural features of a webpage without relying on the Document Object Model to which many content authors fail to adhere. The classifier uses a set of features which rely on the length of text, the percentage of hypertext, etc. The resulting classifier is nearly perfect on previously unseen news pages from different sites. The proposed technique is successfully employed in Alzoa.com, which is the largest Arabic news aggregator on the web.</abstract>
<identifier type="citekey">ibrahim-etal-2008-automatic</identifier>
<location>
<url>http://www.lrec-conf.org/proceedings/lrec2008/pdf/407_paper.pdf</url>
</location>
<part>
<date>2008-05</date>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Automatic Extraction of Textual Elements from News Web Pages
%A Ibrahim, Hossam
%A Darwish, Kareem
%A Madany, Abdel-Rahim
%Y Calzolari, Nicoletta
%Y Choukri, Khalid
%Y Maegaard, Bente
%Y Mariani, Joseph
%Y Odijk, Jan
%Y Piperidis, Stelios
%Y Tapias, Daniel
%S Proceedings of the Sixth International Conference on Language Resources and Evaluation (LREC’08)
%D 2008
%8 May
%I European Language Resources Association (ELRA)
%C Marrakech, Morocco
%F ibrahim-etal-2008-automatic
%X In this paper we present an algorithm for automatic extraction of textual elements, namely titles and full text, associated with news stories in news web pages. We propose a supervised machine learning classification technique based on the use of a Support Vector Machine (SVM) classifier to extract the desired textual elements. The technique uses internal structural features of a webpage without relying on the Document Object Model to which many content authors fail to adhere. The classifier uses a set of features which rely on the length of text, the percentage of hypertext, etc. The resulting classifier is nearly perfect on previously unseen news pages from different sites. The proposed technique is successfully employed in Alzoa.com, which is the largest Arabic news aggregator on the web.
%U http://www.lrec-conf.org/proceedings/lrec2008/pdf/407_paper.pdf
Markdown (Informal)
[Automatic Extraction of Textual Elements from News Web Pages](http://www.lrec-conf.org/proceedings/lrec2008/pdf/407_paper.pdf) (Ibrahim et al., LREC 2008)
ACL