% Workshop paper (Third Workshop on Multimodal Artificial Intelligence, 2021); ACL Anthology ID 2021.maiworkshop-1.11.
@inproceedings{ding-etal-2021-semantic,
    title     = "Semantic Aligned Multi-modal Transformer for Vision-{Language} {Understanding}: A Preliminary Study on Visual {QA}",
    author    = "Ding, Han and
                 Li, Li Erran and
                 Hu, Zhiting and
                 Xu, Yi and
                 Hakkani-Tur, Dilek and
                 Du, Zheng and
                 Zeng, Belinda",
    editor    = "Zadeh, Amir and
                 Morency, Louis-Philippe and
                 Liang, Paul Pu and
                 Ross, Candace and
                 Salakhutdinov, Ruslan and
                 Poria, Soujanya and
                 Cambria, Erik and
                 Shi, Kelly",
    booktitle = "Proceedings of the Third Workshop on Multimodal Artificial Intelligence",
    month     = jun,
    year      = "2021",
    address   = "Mexico City, Mexico",
    publisher = "Association for Computational Linguistics",
    url       = "https://aclanthology.org/2021.maiworkshop-1.11/",
    doi       = "10.18653/v1/2021.maiworkshop-1.11",
    pages     = "74--78"
}