@InProceedings{gu-EtAl:2018:C18-12,
  author    = {Gu, Yue and Yang, Kangning and Fu, Shiyu and Chen, Shuhong and Li, Xinyu and Marsic, Ivan},
  title     = {Hybrid Attention based Multimodal Network for Spoken Language Classification},
  booktitle = {Proceedings of the 27th International Conference on Computational Linguistics},
  month     = {August},
  year      = {2018},
  address   = {Santa Fe, New Mexico, USA},
  publisher = {Association for Computational Linguistics},
  pages     = {2379--2390},
  abstract  = {We examine the utility of linguistic content and vocal characteristics for multimodal deep learning in human spoken language understanding. We present a deep multimodal network with both feature attention and modality attention to classify utterance-level speech data. The proposed hybrid attention architecture helps the system focus on learning informative representations for both modality-specific feature extraction and model fusion. Experimental results show that our system achieves state-of-the-art or competitive performance on three published multimodal datasets. We also demonstrate the effectiveness and generalization of our system on a medical speech dataset from an actual trauma scenario. Furthermore, we provide a detailed comparison and analysis of traditional approaches and deep learning methods for both feature extraction and fusion.},
  url       = {http://www.aclweb.org/anthology/C18-1201}
}

