@inproceedings{mysore-gopinath-etal-2018-supervised,
title = "Supervised and Unsupervised Methods for Robust Separation of Section Titles and Prose Text in Web Documents",
author = "Mysore Gopinath, Abhijith Athreya and
Wilson, Shomir and
Sadeh, Norman",
editor = "Riloff, Ellen and
Chiang, David and
Hockenmaier, Julia and
Tsujii, Jun{'}ichi",
booktitle = "Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing",
month = oct # "-" # nov,
year = "2018",
address = "Brussels, Belgium",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/D18-1099",
doi = "10.18653/v1/D18-1099",
pages = "850--855",
abstract = "The text in many web documents is organized into a hierarchy of section titles and corresponding prose content, a structure which provides potentially exploitable information on discourse structure and topicality. However, this organization is generally discarded during text collection, and collecting it is not straightforward: the same visual organization can be implemented in a myriad of different ways in the underlying HTML. To remedy this, we present a flexible system for automatically extracting the hierarchical section titles and prose organization of web documents irrespective of differences in HTML representation. This system uses features from syntax, semantics, discourse and markup to build two models which classify HTML text into section titles and prose text. When tested on three different domains of web text, our domain-independent system achieves an overall precision of 0.82 and a recall of 0.98. The domain-dependent variation produces very high precision (0.99) at the expense of recall (0.75). These results exhibit a robust level of accuracy suitable for enhancing question answering, information extraction, and summarization.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="mysore-gopinath-etal-2018-supervised">
<titleInfo>
<title>Supervised and Unsupervised Methods for Robust Separation of Section Titles and Prose Text in Web Documents</title>
</titleInfo>
<name type="personal">
<namePart type="given">Abhijith</namePart>
<namePart type="given">Athreya</namePart>
<namePart type="family">Mysore Gopinath</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shomir</namePart>
<namePart type="family">Wilson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Norman</namePart>
<namePart type="family">Sadeh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2018-oct-nov</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ellen</namePart>
<namePart type="family">Riloff</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Chiang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Julia</namePart>
<namePart type="family">Hockenmaier</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jun’ichi</namePart>
<namePart type="family">Tsujii</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Brussels, Belgium</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>The text in many web documents is organized into a hierarchy of section titles and corresponding prose content, a structure which provides potentially exploitable information on discourse structure and topicality. However, this organization is generally discarded during text collection, and collecting it is not straightforward: the same visual organization can be implemented in a myriad of different ways in the underlying HTML. To remedy this, we present a flexible system for automatically extracting the hierarchical section titles and prose organization of web documents irrespective of differences in HTML representation. This system uses features from syntax, semantics, discourse and markup to build two models which classify HTML text into section titles and prose text. When tested on three different domains of web text, our domain-independent system achieves an overall precision of 0.82 and a recall of 0.98. The domain-dependent variation produces very high precision (0.99) at the expense of recall (0.75). These results exhibit a robust level of accuracy suitable for enhancing question answering, information extraction, and summarization.</abstract>
<identifier type="citekey">mysore-gopinath-etal-2018-supervised</identifier>
<identifier type="doi">10.18653/v1/D18-1099</identifier>
<location>
<url>https://aclanthology.org/D18-1099</url>
</location>
<part>
<date>2018-oct-nov</date>
<extent unit="page">
<start>850</start>
<end>855</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Supervised and Unsupervised Methods for Robust Separation of Section Titles and Prose Text in Web Documents
%A Mysore Gopinath, Abhijith Athreya
%A Wilson, Shomir
%A Sadeh, Norman
%Y Riloff, Ellen
%Y Chiang, David
%Y Hockenmaier, Julia
%Y Tsujii, Jun’ichi
%S Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing
%D 2018
%8 oct nov
%I Association for Computational Linguistics
%C Brussels, Belgium
%F mysore-gopinath-etal-2018-supervised
%X The text in many web documents is organized into a hierarchy of section titles and corresponding prose content, a structure which provides potentially exploitable information on discourse structure and topicality. However, this organization is generally discarded during text collection, and collecting it is not straightforward: the same visual organization can be implemented in a myriad of different ways in the underlying HTML. To remedy this, we present a flexible system for automatically extracting the hierarchical section titles and prose organization of web documents irrespective of differences in HTML representation. This system uses features from syntax, semantics, discourse and markup to build two models which classify HTML text into section titles and prose text. When tested on three different domains of web text, our domain-independent system achieves an overall precision of 0.82 and a recall of 0.98. The domain-dependent variation produces very high precision (0.99) at the expense of recall (0.75). These results exhibit a robust level of accuracy suitable for enhancing question answering, information extraction, and summarization.
%R 10.18653/v1/D18-1099
%U https://aclanthology.org/D18-1099
%U https://doi.org/10.18653/v1/D18-1099
%P 850-855
Markdown (Informal)
[Supervised and Unsupervised Methods for Robust Separation of Section Titles and Prose Text in Web Documents](https://aclanthology.org/D18-1099) (Mysore Gopinath et al., EMNLP 2018)
ACL