@InProceedings{harwath-glass:2017:Long,
  author    = {Harwath, David and Glass, James},
  title     = {Learning Word-Like Units from Joint Audio-Visual Analysis},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {506--517},
  abstract  = {Given a collection of images and spoken audio captions, we present a method for
               discovering word-like acoustic units in the continuous speech signal and
               grounding them to semantically relevant image regions. For example, our model
               is able to detect spoken instances of the word `lighthouse' within an utterance
               and associate them with image regions containing lighthouses. We do not use any
               form of conventional automatic speech recognition, nor do we use any text
               transcriptions or conventional linguistic annotations. Our model effectively
               implements a form of spoken language acquisition, in which the computer learns
               not only to recognize word categories by sound, but also to enrich the words it
               learns with semantics by grounding them in images.},
  url       = {http://aclweb.org/anthology/P17-1047}
}
