@InProceedings{vu-EtAl:2018:C18-1,
  author    = {Vu, Hoa  and  Greco, Claudio  and  Erofeeva, Aliia  and  Jafaritazehjan, Somayeh  and  Linders, Guido  and  Tanti, Marc  and  Testoni, Alberto  and  Bernardi, Raffaella  and  Gatt, Albert},
  title     = {Grounded Textual Entailment},
  booktitle = {Proceedings of the 27th International Conference on Computational Linguistics},
  month     = {August},
  year      = {2018},
  address   = {Santa Fe, New Mexico, USA},
  publisher = {Association for Computational Linguistics},
  pages     = {2354--2368},
  abstract  = {Capturing semantic relations between sentences, such as entailment, is a long-standing challenge for computational semantics. Logic-based models analyse entailment in terms of possible worlds (interpretations, or situations) where a premise P entails a hypothesis H iff in all worlds where P is true, H is also true. Statistical models view this relationship probabilistically, addressing it in terms of whether a human would likely infer H from P. In this paper, we wish to bridge these two perspectives, by arguing for a visually-grounded version of the Textual Entailment task. Specifically, we ask whether models can perform better if, in addition to P and H, there is also an image (corresponding to the relevant "world" or "situation"). We use a multimodal version of the SNLI dataset (Bowman et al., 2015) and we compare "blind" and visually-augmented models of textual entailment. We show that visual information is beneficial, but we also conduct an in-depth error analysis that reveals that current multimodal models are not performing "grounding" in an optimal fashion.},
  url       = {http://www.aclweb.org/anthology/C18-1199}
}