@article{arnold-etal-2019-sector,
title = "{SECTOR}: A Neural Model for Coherent Topic Segmentation and Classification",
author = {Arnold, Sebastian and
Schneider, Rudolf and
Cudr{\'e}-Mauroux, Philippe and
Gers, Felix A. and
L{\"o}ser, Alexander},
editor = "Lee, Lillian and
Johnson, Mark and
Roark, Brian and
Nenkova, Ani",
journal = "Transactions of the Association for Computational Linguistics",
volume = "7",
year = "2019",
address = "Cambridge, MA",
publisher = "MIT Press",
url = "https://aclanthology.org/Q19-1011",
doi = "10.1162/tacl_a_00261",
pages = "169--184",
abstract = "When searching for information, a human reader first glances over a document, spots relevant sections, and then focuses on a few sentences for resolving her intention. However, the high variance of document structure complicates the identification of the salient topic of a given section at a glance. To tackle this challenge, we present SECTOR, a model to support machine reading systems by segmenting documents into coherent sections and assigning topic labels to each section. Our deep neural network architecture learns a latent topic embedding over the course of a document. This can be leveraged to classify local topics from plain text and segment a document at topic shifts. In addition, we contribute WikiSection, a publicly available data set with 242k labeled sections in English and German from two distinct domains: diseases and cities. From our extensive evaluation of 20 architectures, we report a highest score of 71.6{\%} F1 for the segmentation and classification of 30 topics from the English city domain, scored by our SECTOR long short-term memory model with Bloom filter embeddings and bidirectional segmentation. This is a significant improvement of 29.5 points F1 over state-of-the-art CNN classifiers with baseline segmentation.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="arnold-etal-2019-sector">
<titleInfo>
<title>SECTOR: A Neural Model for Coherent Topic Segmentation and Classification</title>
</titleInfo>
<name type="personal">
<namePart type="given">Sebastian</namePart>
<namePart type="family">Arnold</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rudolf</namePart>
<namePart type="family">Schneider</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Philippe</namePart>
<namePart type="family">Cudré-Mauroux</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Felix</namePart>
<namePart type="given">A</namePart>
<namePart type="family">Gers</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alexander</namePart>
<namePart type="family">Löser</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2019</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<genre authority="bibutilsgt">journal article</genre>
<relatedItem type="host">
<titleInfo>
<title>Transactions of the Association for Computational Linguistics</title>
</titleInfo>
<originInfo>
<issuance>continuing</issuance>
<publisher>MIT Press</publisher>
<place>
<placeTerm type="text">Cambridge, MA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">periodical</genre>
<genre authority="bibutilsgt">academic journal</genre>
</relatedItem>
<abstract>When searching for information, a human reader first glances over a document, spots relevant sections, and then focuses on a few sentences for resolving her intention. However, the high variance of document structure complicates the identification of the salient topic of a given section at a glance. To tackle this challenge, we present SECTOR, a model to support machine reading systems by segmenting documents into coherent sections and assigning topic labels to each section. Our deep neural network architecture learns a latent topic embedding over the course of a document. This can be leveraged to classify local topics from plain text and segment a document at topic shifts. In addition, we contribute WikiSection, a publicly available data set with 242k labeled sections in English and German from two distinct domains: diseases and cities. From our extensive evaluation of 20 architectures, we report a highest score of 71.6% F1 for the segmentation and classification of 30 topics from the English city domain, scored by our SECTOR long short-term memory model with Bloom filter embeddings and bidirectional segmentation. This is a significant improvement of 29.5 points F1 over state-of-the-art CNN classifiers with baseline segmentation.</abstract>
<identifier type="citekey">arnold-etal-2019-sector</identifier>
<identifier type="doi">10.1162/tacl_a_00261</identifier>
<location>
<url>https://aclanthology.org/Q19-1011</url>
</location>
<part>
<date>2019</date>
<detail type="volume"><number>7</number></detail>
<extent unit="page">
<start>169</start>
<end>184</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Journal Article
%T SECTOR: A Neural Model for Coherent Topic Segmentation and Classification
%A Arnold, Sebastian
%A Schneider, Rudolf
%A Cudré-Mauroux, Philippe
%A Gers, Felix A.
%A Löser, Alexander
%J Transactions of the Association for Computational Linguistics
%D 2019
%V 7
%I MIT Press
%C Cambridge, MA
%F arnold-etal-2019-sector
%X When searching for information, a human reader first glances over a document, spots relevant sections, and then focuses on a few sentences for resolving her intention. However, the high variance of document structure complicates the identification of the salient topic of a given section at a glance. To tackle this challenge, we present SECTOR, a model to support machine reading systems by segmenting documents into coherent sections and assigning topic labels to each section. Our deep neural network architecture learns a latent topic embedding over the course of a document. This can be leveraged to classify local topics from plain text and segment a document at topic shifts. In addition, we contribute WikiSection, a publicly available data set with 242k labeled sections in English and German from two distinct domains: diseases and cities. From our extensive evaluation of 20 architectures, we report a highest score of 71.6% F1 for the segmentation and classification of 30 topics from the English city domain, scored by our SECTOR long short-term memory model with Bloom filter embeddings and bidirectional segmentation. This is a significant improvement of 29.5 points F1 over state-of-the-art CNN classifiers with baseline segmentation.
%R 10.1162/tacl_a_00261
%U https://aclanthology.org/Q19-1011
%U https://doi.org/10.1162/tacl_a_00261
%P 169-184
Markdown (Informal)
[SECTOR: A Neural Model for Coherent Topic Segmentation and Classification](https://aclanthology.org/Q19-1011) (Arnold et al., TACL 2019)
ACL