@inproceedings{su-etal-2016-topic,
title = "Topic Stability over Noisy Sources",
author = "Su, Jing and
Greene, Derek and
Boydell, Ois{\'\i}n",
editor = "Han, Bo and
Ritter, Alan and
Derczynski, Leon and
Xu, Wei and
Baldwin, Tim",
booktitle = "Proceedings of the 2nd Workshop on Noisy User-generated Text ({WNUT})",
month = dec,
year = "2016",
address = "Osaka, Japan",
publisher = "The COLING 2016 Organizing Committee",
url = "https://aclanthology.org/W16-3913",
pages = "85--93",
abstract = "Topic modelling techniques such as LDA have recently been applied to speech transcripts and OCR output. These corpora may contain noisy or erroneous texts which may undermine topic stability. Therefore, it is important to know how well a topic modelling algorithm will perform when applied to noisy data. In this paper we show that different types of textual noise can have diverse effects on the stability of topic models. On the other hand, topic model stability is not consistent with the same type but different levels of noise. We introduce a dictionary filtering approach to address this challenge, with the result that a topic model with the correct number of topics is always identified across different levels of noise.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="su-etal-2016-topic">
<titleInfo>
<title>Topic Stability over Noisy Sources</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jing</namePart>
<namePart type="family">Su</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Derek</namePart>
<namePart type="family">Greene</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Oisín</namePart>
<namePart type="family">Boydell</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2016-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2nd Workshop on Noisy User-generated Text (WNUT)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Bo</namePart>
<namePart type="family">Han</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alan</namePart>
<namePart type="family">Ritter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Leon</namePart>
<namePart type="family">Derczynski</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wei</namePart>
<namePart type="family">Xu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tim</namePart>
<namePart type="family">Baldwin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>The COLING 2016 Organizing Committee</publisher>
<place>
<placeTerm type="text">Osaka, Japan</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Topic modelling techniques such as LDA have recently been applied to speech transcripts and OCR output. These corpora may contain noisy or erroneous texts which may undermine topic stability. Therefore, it is important to know how well a topic modelling algorithm will perform when applied to noisy data. In this paper we show that different types of textual noise can have diverse effects on the stability of topic models. On the other hand, topic model stability is not consistent with the same type but different levels of noise. We introduce a dictionary filtering approach to address this challenge, with the result that a topic model with the correct number of topics is always identified across different levels of noise.</abstract>
<identifier type="citekey">su-etal-2016-topic</identifier>
<location>
<url>https://aclanthology.org/W16-3913</url>
</location>
<part>
<date>2016-12</date>
<extent unit="page">
<start>85</start>
<end>93</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Topic Stability over Noisy Sources
%A Su, Jing
%A Greene, Derek
%A Boydell, Oisín
%Y Han, Bo
%Y Ritter, Alan
%Y Derczynski, Leon
%Y Xu, Wei
%Y Baldwin, Tim
%S Proceedings of the 2nd Workshop on Noisy User-generated Text (WNUT)
%D 2016
%8 December
%I The COLING 2016 Organizing Committee
%C Osaka, Japan
%F su-etal-2016-topic
%X Topic modelling techniques such as LDA have recently been applied to speech transcripts and OCR output. These corpora may contain noisy or erroneous texts which may undermine topic stability. Therefore, it is important to know how well a topic modelling algorithm will perform when applied to noisy data. In this paper we show that different types of textual noise can have diverse effects on the stability of topic models. On the other hand, topic model stability is not consistent with the same type but different levels of noise. We introduce a dictionary filtering approach to address this challenge, with the result that a topic model with the correct number of topics is always identified across different levels of noise.
%U https://aclanthology.org/W16-3913
%P 85-93
Markdown (Informal)
[Topic Stability over Noisy Sources](https://aclanthology.org/W16-3913) (Su et al., WNUT 2016)
ACL
- Jing Su, Derek Greene, and Oisín Boydell. 2016. Topic Stability over Noisy Sources. In Proceedings of the 2nd Workshop on Noisy User-generated Text (WNUT), pages 85–93, Osaka, Japan. The COLING 2016 Organizing Committee.