% Bergmanis and Goldwater, EACL 2017 long paper (anthology ID E17-1032 per the url field).
% month uses the standard BibTeX macro so that styles can abbreviate or localise it;
% the percent sign in the abstract is escaped so the field is safe to typeset.
@InProceedings{bergmanis-goldwater:2017:EACLlong,
  author    = {Bergmanis, Toms and Goldwater, Sharon},
  title     = {From Segmentation to Analyses: a Probabilistic Model for Unsupervised Morphology Induction},
  booktitle = {Proceedings of the 15th Conference of the European Chapter of the Association for Computational Linguistics: Volume 1, Long Papers},
  month     = apr,
  year      = {2017},
  address   = {Valencia, Spain},
  publisher = {Association for Computational Linguistics},
  pages     = {337--346},
  abstract  = {A major motivation for unsupervised morphological analysis is to reduce the
               sparse data problem in under-resourced languages. Most previous work focus on
               segmenting surface forms into their constituent morphs (taking: tak +ing), but
               surface form segmentation does not solve the sparse data problem as the
               analyses of take and taking are not connected to each other. We present a
               system that adapts the MorphoChains system (Narasimhan et al., 2015) to provide
               morphological analyses that aim to abstract over spelling differences in
               functionally similar morphs. This results in analyses that are not compelled to
               use all the orthographic material of a word (stopping: stop +ing) or limited to
               only that material (acidified: acid +ify +ed). On average across six
               typologically varied languages our system has a similar or better F-score on
               EMMA (a measure of underlying morpheme accuracy) than three strong baselines;
               moreover, the total number of distinct morphemes identified by our system is on
               average 12.8\% lower than for Morfessor (Virpioja et al., 2013), a
               state-of-the-art surface segmentation system.},
  url       = {http://www.aclweb.org/anthology/E17-1032}
}

