@inproceedings{mota-etal-2019-beamseg,
title = "{B}eam{S}eg: A Joint Model for Multi-Document Segmentation and Topic Identification",
author = "Mota, Pedro and
Eskenazi, Maxine and
Coheur, Lu{\'\i}sa",
editor = "Bansal, Mohit and
Villavicencio, Aline",
booktitle = "Proceedings of the 23rd Conference on Computational Natural Language Learning (CoNLL)",
month = nov,
year = "2019",
address = "Hong Kong, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/K19-1054",
doi = "10.18653/v1/K19-1054",
pages = "582--592",
abstract = "We propose BeamSeg, a joint model for segmentation and topic identification of documents from the same domain. The model assumes that lexical cohesion can be observed across documents, meaning that segments describing the same topic use a similar lexical distribution over the vocabulary. The model implements lexical cohesion in an unsupervised Bayesian setting by drawing from the same language model segments with the same topic. Contrary to previous approaches, we assume that language models are not independent, since the vocabulary changes in consecutive segments are expected to be smooth and not abrupt. We achieve this by using a dynamic Dirichlet prior that takes into account data contributions from other topics. BeamSeg also models segment length properties of documents based on modality (textbooks, slides, \textit{etc.}). The evaluation is carried out in three datasets. In two of them, improvements of up to 4.8{\%} and 7.3{\%} are obtained in the segmentation and topic identifications tasks, indicating that both tasks should be jointly modeled.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="mota-etal-2019-beamseg">
<titleInfo>
<title>BeamSeg: A Joint Model for Multi-Document Segmentation and Topic Identification</title>
</titleInfo>
<name type="personal">
<namePart type="given">Pedro</namePart>
<namePart type="family">Mota</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Maxine</namePart>
<namePart type="family">Eskenazi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Luísa</namePart>
<namePart type="family">Coheur</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2019-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 23rd Conference on Computational Natural Language Learning (CoNLL)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Mohit</namePart>
<namePart type="family">Bansal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aline</namePart>
<namePart type="family">Villavicencio</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Hong Kong, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We propose BeamSeg, a joint model for segmentation and topic identification of documents from the same domain. The model assumes that lexical cohesion can be observed across documents, meaning that segments describing the same topic use a similar lexical distribution over the vocabulary. The model implements lexical cohesion in an unsupervised Bayesian setting by drawing from the same language model segments with the same topic. Contrary to previous approaches, we assume that language models are not independent, since the vocabulary changes in consecutive segments are expected to be smooth and not abrupt. We achieve this by using a dynamic Dirichlet prior that takes into account data contributions from other topics. BeamSeg also models segment length properties of documents based on modality (textbooks, slides, etc.). The evaluation is carried out in three datasets. In two of them, improvements of up to 4.8% and 7.3% are obtained in the segmentation and topic identifications tasks, indicating that both tasks should be jointly modeled.</abstract>
<identifier type="citekey">mota-etal-2019-beamseg</identifier>
<identifier type="doi">10.18653/v1/K19-1054</identifier>
<location>
<url>https://aclanthology.org/K19-1054</url>
</location>
<part>
<date>2019-11</date>
<extent unit="page">
<start>582</start>
<end>592</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T BeamSeg: A Joint Model for Multi-Document Segmentation and Topic Identification
%A Mota, Pedro
%A Eskenazi, Maxine
%A Coheur, Luísa
%Y Bansal, Mohit
%Y Villavicencio, Aline
%S Proceedings of the 23rd Conference on Computational Natural Language Learning (CoNLL)
%D 2019
%8 November
%I Association for Computational Linguistics
%C Hong Kong, China
%F mota-etal-2019-beamseg
%X We propose BeamSeg, a joint model for segmentation and topic identification of documents from the same domain. The model assumes that lexical cohesion can be observed across documents, meaning that segments describing the same topic use a similar lexical distribution over the vocabulary. The model implements lexical cohesion in an unsupervised Bayesian setting by drawing from the same language model segments with the same topic. Contrary to previous approaches, we assume that language models are not independent, since the vocabulary changes in consecutive segments are expected to be smooth and not abrupt. We achieve this by using a dynamic Dirichlet prior that takes into account data contributions from other topics. BeamSeg also models segment length properties of documents based on modality (textbooks, slides, etc.). The evaluation is carried out in three datasets. In two of them, improvements of up to 4.8% and 7.3% are obtained in the segmentation and topic identifications tasks, indicating that both tasks should be jointly modeled.
%R 10.18653/v1/K19-1054
%U https://aclanthology.org/K19-1054
%U https://doi.org/10.18653/v1/K19-1054
%P 582-592
Markdown (Informal)
[BeamSeg: A Joint Model for Multi-Document Segmentation and Topic Identification](https://aclanthology.org/K19-1054) (Mota et al., CoNLL 2019)
ACL