% Journal article record (Transactions of the Association for Computational Linguistics, vol. 5, 2017).
% Quotation marks in the abstract are written as ``...'' because a backslash
% followed by a double quote is the LaTeX umlaut accent, not a quote character.
% "Bayesian" in the title is braced so sentence-casing styles keep its capital.
@article{TACL1037,
	author = {Fujii, Ryo and Domoto, Ryo and Mochihashi, Daichi},
	title = {Nonparametric {Bayesian} Semi-supervised Word Segmentation},
	journal = {Transactions of the Association for Computational Linguistics},
	volume = {5},
	year = {2017},
	pages = {179--189},
	issn = {2307-387X},
	url = {https://transacl.org/ojs/index.php/tacl/article/view/1037},
	abstract = {This paper presents a novel hybrid generative/discriminative model of word segmentation based on nonparametric Bayesian methods.  Unlike ordinary discriminative word segmentation which relies only on labeled data, our semi-supervised model also leverages a huge amounts of unlabeled text to automatically learn new ``words'', and further constrains them by using a labeled data to segment non-standard texts such as those found in social networking services. Specifically, our hybrid model combines a discriminative classifier (CRF; Lafferty et al. (2001)) and unsupervised word segmentation (NPYLM; Mochihashi et al. (2009)) with a transparent exchange of information between these two model structures within the semi-supervised framework (JESS-CM; Suzuki et al. (2008)).  We confirmed that it can appropriately segment non-standard texts like those in Twitter and Weibo and has nearly state-of-the-art accuracy on standard datasets in Japanese, Chinese, and Thai.},
}
