@InProceedings{han-schlangen:2017:I17-2,
  author    = {Han, Ting  and  Schlangen, David},
  title     = {Draw and Tell: Multimodal Descriptions Outperform Verbal- or Sketch-Only Descriptions in an Image Retrieval Task},
  booktitle = {Proceedings of the Eighth International Joint Conference on Natural Language Processing (Volume 2: Short Papers)},
  month     = {November},
  year      = {2017},
  address   = {Taipei, Taiwan},
  publisher = {Asian Federation of Natural Language Processing},
  pages     = {361--365},
  abstract  = {While language conveys meaning largely symbolically, actual
               communication acts typically contain iconic elements as well:
               People gesture while they speak, or may even draw sketches
               while explaining something. Image retrieval prima facie seems
               like a task that could profit from combined symbolic and iconic
               reference, but it is typically set up to work either from
               language only, or via (iconic) sketches with no verbal
               contribution. Using a model of grounded language semantics and
               a model of sketch-to-image mapping, we show that adding even
               very reduced iconic information to a verbal image description
               improves recall. Verbal descriptions paired with fully detailed
               sketches still perform better than these sketches alone. We see
               these results as supporting the assumption that natural user
               interfaces should respond to multimodal input, where possible,
               rather than just language alone.},
  url       = {http://www.aclweb.org/anthology/I17-2061}
}