% Workshop paper; title words that must keep their capitals under
% sentence-casing styles are protected with whole-word braces.
@inproceedings{ding-etal-2021-semantic,
    title = "Semantic Aligned Multi-modal Transformer for Vision-{Language} {Understanding}: A Preliminary Study on Visual {QA}",
    author = "Ding, Han  and
      Li, Li Erran  and
      Hu, Zhiting  and
      Xu, Yi  and
      Hakkani-Tur, Dilek  and
      Du, Zheng  and
      Zeng, Belinda",
    editor = "Zadeh, Amir  and
      Morency, Louis-Philippe  and
      Liang, Paul Pu  and
      Ross, Candace  and
      Salakhutdinov, Ruslan  and
      Poria, Soujanya  and
      Cambria, Erik  and
      Shi, Kelly",
    booktitle = "Proceedings of the Third Workshop on Multimodal Artificial Intelligence",
    month = jun,
    year = "2021",
    address = "Mexico City, Mexico",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2021.maiworkshop-1.11/",
    doi = "10.18653/v1/2021.maiworkshop-1.11",
    pages = "74--78"
}