@inproceedings{oshikiri:2017:EMNLP2017,
  author    = {Oshikiri, Takamasa},
  title     = {Segmentation-Free Word Embedding for Unsegmented Languages},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = sep,
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {767--772},
  abstract  = {In this paper, we propose a new pipeline of word embedding for unsegmented
	languages, called segmentation-free word embedding, which does not require word
	segmentation as a preprocessing step. Unlike space-delimited languages,
	unsegmented languages, such as Chinese and Japanese, require word segmentation
	as a preprocessing step. However, word segmentation, that often requires
	manually annotated resources, is difficult and expensive, and unavoidable
	errors in word segmentation affect downstream tasks. To avoid these problems in
	learning word vectors of unsegmented languages, we consider word co-occurrence
	statistics over all possible candidates of segmentations based on frequent
	character n-grams instead of segmented sentences provided by conventional word
	segmenters. Our experiments of noun category prediction tasks on raw Twitter,
	Weibo, and Wikipedia corpora show that the proposed method outperforms the
	conventional approaches that require word segmenters.},
  url       = {https://www.aclweb.org/anthology/D17-1080},
}

