@InProceedings{collell-moens:2016:COLING,
  author    = {Collell, Guillem and Moens, Marie-Francine},
  title     = {Is an Image Worth More than a Thousand Words? On the Fine-Grain Semantic Differences between Visual and Linguistic Representations},
  booktitle = {Proceedings of COLING 2016, the 26th International Conference on Computational Linguistics: Technical Papers},
  month     = {December},
  year      = {2016},
  address   = {Osaka, Japan},
  publisher = {The COLING 2016 Organizing Committee},
  pages     = {2807--2817},
  abstract  = {Human concept representations are often grounded with visual information, yet
	some aspects of meaning cannot be visually represented or are better described
	with language. Thus, vision and language provide complementary information
	that, properly combined, can potentially yield more complete concept
	representations. Recently, state-of-the-art distributional semantic models and
	convolutional neural networks have achieved great success in representing
	linguistic and visual knowledge, respectively. In this paper, we compare
	visual and linguistic representations in their ability to capture different
	types of fine-grained semantic knowledge---or attributes---of concepts. Humans
	often describe objects using attributes, that is, properties such as shape,
	color, or functionality, which often transcend the linguistic and visual
	modalities. In our setting, we evaluate how well attributes can be predicted
	using the unimodal representations as inputs. We are interested, first, in
	finding out whether attributes are generally better captured by the vision
	or the language modality; and second, if neither is clearly superior (as we
	hypothesize), in what types of attributes or semantic knowledge are better
	encoded by each modality. Ultimately, our study sheds light on the
	potential of combining visual and textual representations.},
  url       = {http://aclweb.org/anthology/C16-1264}
}