@article{fujii-etal-2017-nonparametric,
title = "Nonparametric {B}ayesian Semi-supervised Word Segmentation",
author = "Fujii, Ryo and
Domoto, Ryo and
Mochihashi, Daichi",
editor = "Lee, Lillian and
Johnson, Mark and
Toutanova, Kristina",
journal = "Transactions of the Association for Computational Linguistics",
volume = "5",
year = "2017",
address = "Cambridge, MA",
publisher = "MIT Press",
url = "https://aclanthology.org/Q17-1013/",
doi = "10.1162/tacl_a_00054",
pages = "179--189",
abstract = "This paper presents a novel hybrid generative/discriminative model of word segmentation based on nonparametric Bayesian methods. Unlike ordinary discriminative word segmentation which relies only on labeled data, our semi-supervised model also leverages a huge amount of unlabeled text to automatically learn new {\textquotedblleft}words{\textquotedblright}, and further constrains them by using labeled data to segment non-standard texts such as those found in social networking services. Specifically, our hybrid model combines a discriminative classifier (CRF; Lafferty et al. (2001)) and unsupervised word segmentation (NPYLM; Mochihashi et al. (2009)), with a transparent exchange of information between these two model structures within the semi-supervised framework (JESS-CM; Suzuki and Isozaki (2008)). We confirmed that it can appropriately segment non-standard texts like those in Twitter and Weibo and has nearly state-of-the-art accuracy on standard datasets in Japanese, Chinese, and Thai."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="fujii-etal-2017-nonparametric">
<titleInfo>
<title>Nonparametric Bayesian Semi-supervised Word Segmentation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ryo</namePart>
<namePart type="family">Fujii</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ryo</namePart>
<namePart type="family">Domoto</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Daichi</namePart>
<namePart type="family">Mochihashi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2017</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<genre authority="bibutilsgt">journal article</genre>
<relatedItem type="host">
<titleInfo>
<title>Transactions of the Association for Computational Linguistics</title>
</titleInfo>
<originInfo>
<issuance>continuing</issuance>
<publisher>MIT Press</publisher>
<place>
<placeTerm type="text">Cambridge, MA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">periodical</genre>
<genre authority="bibutilsgt">academic journal</genre>
</relatedItem>
<abstract>This paper presents a novel hybrid generative/discriminative model of word segmentation based on nonparametric Bayesian methods. Unlike ordinary discriminative word segmentation which relies only on labeled data, our semi-supervised model also leverages a huge amount of unlabeled text to automatically learn new “words”, and further constrains them by using labeled data to segment non-standard texts such as those found in social networking services. Specifically, our hybrid model combines a discriminative classifier (CRF; Lafferty et al. (2001)) and unsupervised word segmentation (NPYLM; Mochihashi et al. (2009)), with a transparent exchange of information between these two model structures within the semi-supervised framework (JESS-CM; Suzuki and Isozaki (2008)). We confirmed that it can appropriately segment non-standard texts like those in Twitter and Weibo and has nearly state-of-the-art accuracy on standard datasets in Japanese, Chinese, and Thai.</abstract>
<identifier type="citekey">fujii-etal-2017-nonparametric</identifier>
<identifier type="doi">10.1162/tacl_a_00054</identifier>
<location>
<url>https://aclanthology.org/Q17-1013/</url>
</location>
<part>
<date>2017</date>
<detail type="volume"><number>5</number></detail>
<extent unit="page">
<start>179</start>
<end>189</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Journal Article
%T Nonparametric Bayesian Semi-supervised Word Segmentation
%A Fujii, Ryo
%A Domoto, Ryo
%A Mochihashi, Daichi
%J Transactions of the Association for Computational Linguistics
%D 2017
%V 5
%I MIT Press
%C Cambridge, MA
%F fujii-etal-2017-nonparametric
%X This paper presents a novel hybrid generative/discriminative model of word segmentation based on nonparametric Bayesian methods. Unlike ordinary discriminative word segmentation which relies only on labeled data, our semi-supervised model also leverages a huge amount of unlabeled text to automatically learn new “words”, and further constrains them by using labeled data to segment non-standard texts such as those found in social networking services. Specifically, our hybrid model combines a discriminative classifier (CRF; Lafferty et al. (2001)) and unsupervised word segmentation (NPYLM; Mochihashi et al. (2009)), with a transparent exchange of information between these two model structures within the semi-supervised framework (JESS-CM; Suzuki and Isozaki (2008)). We confirmed that it can appropriately segment non-standard texts like those in Twitter and Weibo and has nearly state-of-the-art accuracy on standard datasets in Japanese, Chinese, and Thai.
%R 10.1162/tacl_a_00054
%U https://aclanthology.org/Q17-1013/
%U https://doi.org/10.1162/tacl_a_00054
%P 179-189
Markdown (Informal)
[Nonparametric Bayesian Semi-supervised Word Segmentation](https://aclanthology.org/Q17-1013/) (Fujii et al., TACL 2017)
ACL