@inproceedings{li-EtAl:2017:EMNLP20171,
  author    = {Li, Haoran and Zhu, Junnan and Ma, Cong and Zhang, Jiajun and Zong, Chengqing},
  title     = {Multi-modal Summarization for Asynchronous Collection of Text, Image, Audio and Video},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = sep,
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {1092--1102},
  url       = {https://www.aclweb.org/anthology/D17-1114},
  abstract  = {The rapid increase of the multimedia data over the Internet necessitates
               multi-modal summarization from collections of text, image, audio and video.
               In this work, we propose an extractive Multi-modal Summarization (MMS) method
               which can automatically generate a textual summary given a set of documents,
               images, audios and videos related to a specific topic. The key idea is to
               bridge the semantic gaps between multi-modal contents. For audio information,
               we design an approach to selectively use its transcription. For vision
               information, we learn joint representations of texts and images using a neural
               network. Finally, all the multi-modal aspects are considered to generate the
               textural summary by maximizing the salience, non-redundancy, readability and
               coverage through budgeted optimization of submodular functions. We further
               introduce an MMS corpus in English and Chinese. The experimental results on
               this dataset demonstrate that our
               method outperforms other competitive baseline methods.},
}

