% Le, Sahoo, Chen, and Hoi (2019), in the Proceedings of the 57th Annual Meeting of the ACL.
@inproceedings{le-etal-2019-multimodal,
  author    = {Le, Hung and Sahoo, Doyen and Chen, Nancy and Hoi, Steven},
  title     = {Multimodal Transformer Networks for End-to-End Video-Grounded Dialogue Systems},
  editor    = {Korhonen, Anna and Traum, David and M{\`a}rquez, Llu{\'i}s},
  booktitle = {Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics},
  year      = {2019},
  month     = jul,
  address   = {Florence, Italy},
  publisher = {Association for Computational Linguistics},
  pages     = {5612--5623},
  doi       = {10.18653/v1/P19-1564},
  url       = {https://aclanthology.org/P19-1564/},
}