@inproceedings{zarriess-schlangen:2017:EMNLP2017,
  author    = {Zarrie{\ss}, Sina and Schlangen, David},
  title     = {Deriving continous grounded meaning representations from referentially structured multimodal contexts},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = sep,
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {959--965},
  doi       = {10.18653/v1/D17-1100},
  url       = {https://www.aclweb.org/anthology/D17-1100},
  abstract  = {Corpora of referring expressions paired with their visual referents are a good
	source for learning word meanings directly grounded in visual representations.
	Here, we explore additional ways of extracting from them word representations
	linked to multi-modal context: through expressions that refer to the same
	object, and through expressions that refer to different objects in the same
	scene. We show that continuous meaning representations derived from these
	contexts capture complementary aspects of similarity, even if not
	outperforming textual embeddings trained on very large amounts of raw text when
	tested on standard similarity benchmarks. We propose a new task for evaluating
	grounded meaning representations---detection of potentially co-referential
	phrases---and show that it requires precise denotational representations of
	attribute meanings, which our method provides.},
}

