@inproceedings{tran-EtAl:2018:N18-11,
  author    = {Tran, Trang and Toshniwal, Shubham and Bansal, Mohit and Gimpel, Kevin and Livescu, Karen and Ostendorf, Mari},
  title     = {Parsing Speech: A Neural Approach to Integrating Lexical and Acoustic-Prosodic Information},
  booktitle = {Proceedings of the 2018 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long Papers)},
  month     = jun,
  year      = {2018},
  address   = {New Orleans, Louisiana},
  publisher = {Association for Computational Linguistics},
  pages     = {69--81},
  doi       = {10.18653/v1/N18-1007},
  url       = {http://www.aclweb.org/anthology/N18-1007},
  abstract  = {In conversational speech, the acoustic signal provides cues that help listeners disambiguate difficult parses. For automatically parsing spoken utterances, we introduce a model that integrates transcribed text and acoustic-prosodic features using a convolutional neural network over energy and pitch trajectories coupled with an attention-based recurrent neural network that accepts text and prosodic features. We find that different types of acoustic-prosodic features are individually helpful, and together give statistically significant improvements in parse and disfluency detection F1 scores over a strong text-only baseline. For this study with known sentence boundaries, error analyses show that the main benefit of acoustic-prosodic features is in sentences with disfluencies, attachment decisions are most improved, and transcription errors obscure gains from prosody.},
}

