% COLING 2016 conference paper on n-gram-based compound segmentation for Finnish.
% The month uses the standard BibTeX macro so styles can abbreviate or localize it.
@InProceedings{shapiro:2016:COLING,
  author    = {Shapiro, Naomi Tachikawa},
  title     = {Splitting compounds with ngrams},
  booktitle = {Proceedings of {COLING} 2016, the 26th International Conference on Computational Linguistics: Technical Papers},
  month     = dec,
  year      = {2016},
  address   = {Osaka, Japan},
  publisher = {The COLING 2016 Organizing Committee},
  pages     = {630--640},
  abstract  = {Compound words with unmarked word boundaries are problematic for many tasks in
               NLP and computational linguistics, including information extraction, machine
               translation, and syllabification. This paper introduces a simple,
               proof-of-concept language modeling approach to automatic compound segmentation,
               as applied to Finnish. This approach utilizes an off-the-shelf morphological
               analyzer to split training words into their constituent morphemes. A language
               model is subsequently trained on ngrams composed of morphemes, morpheme
               boundaries, and word boundaries. Linguistic constraints are then used to weed
               out phonotactically ill-formed segmentations, thereby allowing the language
               model to select the best grammatical segmentation. This approach achieves an
               accuracy of {$\sim$}97\%.},
  url       = {http://aclweb.org/anthology/C16-1061},
}

