@inproceedings{ding-etal-2021-semantic,
title = "Semantic Aligned Multi-modal Transformer for Vision-{L}anguage{U}nderstanding: A Preliminary Study on Visual {QA}",
author = "Ding, Han and
Li, Li Erran and
Hu, Zhiting and
Xu, Yi and
Hakkani-Tur, Dilek and
Du, Zheng and
Zeng, Belinda",
editor = "Zadeh, Amir and
Morency, Louis-Philippe and
Liang, Paul Pu and
Ross, Candace and
Salakhutdinov, Ruslan and
Poria, Soujanya and
Cambria, Erik and
Shi, Kelly",
booktitle = "Proceedings of the Third Workshop on Multimodal Artificial Intelligence",
month = jun,
year = "2021",
address = "Mexico City, Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.maiworkshop-1.11/",
doi = "10.18653/v1/2021.maiworkshop-1.11",
pages = "74--78",
abstract = "Recent vision-language understanding approaches adopt a multi-modal transformer pre-training and finetuning paradigm. Prior work learns representations of text tokens and visual features with cross-attention mechanisms and captures the alignment solely based on indirect signals. In this work, we propose to enhance the alignment mechanism by incorporating image scene graph structures as the bridge between the two modalities, and learning with new contrastive objectives. In our preliminary study on the challenging compositional visual question answering task, we show the proposed approach achieves improved results, demonstrating potentials to enhance vision-language understanding."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="ding-etal-2021-semantic">
<titleInfo>
<title>Semantic Aligned Multi-modal Transformer for Vision-Language Understanding: A Preliminary Study on Visual QA</title>
</titleInfo>
<name type="personal">
<namePart type="given">Han</namePart>
<namePart type="family">Ding</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Li</namePart>
<namePart type="given">Erran</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhiting</namePart>
<namePart type="family">Hu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yi</namePart>
<namePart type="family">Xu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dilek</namePart>
<namePart type="family">Hakkani-Tur</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zheng</namePart>
<namePart type="family">Du</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Belinda</namePart>
<namePart type="family">Zeng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Third Workshop on Multimodal Artificial Intelligence</title>
</titleInfo>
<name type="personal">
<namePart type="given">Amir</namePart>
<namePart type="family">Zadeh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Louis-Philippe</namePart>
<namePart type="family">Morency</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Paul</namePart>
<namePart type="given">Pu</namePart>
<namePart type="family">Liang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Candace</namePart>
<namePart type="family">Ross</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ruslan</namePart>
<namePart type="family">Salakhutdinov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Soujanya</namePart>
<namePart type="family">Poria</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Erik</namePart>
<namePart type="family">Cambria</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kelly</namePart>
<namePart type="family">Shi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Mexico City, Mexico</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Recent vision-language understanding approaches adopt a multi-modal transformer pre-training and fine-tuning paradigm. Prior work learns representations of text tokens and visual features with cross-attention mechanisms and captures alignment solely from indirect signals. In this work, we propose to enhance the alignment mechanism by incorporating image scene graph structures as the bridge between the two modalities, and by learning with new contrastive objectives. In our preliminary study on the challenging compositional visual question answering task, we show that the proposed approach achieves improved results, demonstrating its potential to enhance vision-language understanding.</abstract>
<identifier type="citekey">ding-etal-2021-semantic</identifier>
<identifier type="doi">10.18653/v1/2021.maiworkshop-1.11</identifier>
<location>
<url>https://aclanthology.org/2021.maiworkshop-1.11/</url>
</location>
<part>
<date>2021-06</date>
<extent unit="page">
<start>74</start>
<end>78</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Semantic Aligned Multi-modal Transformer for Vision-Language Understanding: A Preliminary Study on Visual QA
%A Ding, Han
%A Li, Li Erran
%A Hu, Zhiting
%A Xu, Yi
%A Hakkani-Tur, Dilek
%A Du, Zheng
%A Zeng, Belinda
%Y Zadeh, Amir
%Y Morency, Louis-Philippe
%Y Liang, Paul Pu
%Y Ross, Candace
%Y Salakhutdinov, Ruslan
%Y Poria, Soujanya
%Y Cambria, Erik
%Y Shi, Kelly
%S Proceedings of the Third Workshop on Multimodal Artificial Intelligence
%D 2021
%8 June
%I Association for Computational Linguistics
%C Mexico City, Mexico
%F ding-etal-2021-semantic
%X Recent vision-language understanding approaches adopt a multi-modal transformer pre-training and fine-tuning paradigm. Prior work learns representations of text tokens and visual features with cross-attention mechanisms and captures alignment solely from indirect signals. In this work, we propose to enhance the alignment mechanism by incorporating image scene graph structures as the bridge between the two modalities, and by learning with new contrastive objectives. In our preliminary study on the challenging compositional visual question answering task, we show that the proposed approach achieves improved results, demonstrating its potential to enhance vision-language understanding.
%R 10.18653/v1/2021.maiworkshop-1.11
%U https://aclanthology.org/2021.maiworkshop-1.11/
%U https://doi.org/10.18653/v1/2021.maiworkshop-1.11
%P 74-78
Markdown (Informal)
[Semantic Aligned Multi-modal Transformer for Vision-Language Understanding: A Preliminary Study on Visual QA](https://aclanthology.org/2021.maiworkshop-1.11/) (Ding et al., maiworkshop 2021)
ACL
Han Ding, Li Erran Li, Zhiting Hu, Yi Xu, Dilek Hakkani-Tur, Zheng Du, and Belinda Zeng. 2021. Semantic Aligned Multi-modal Transformer for Vision-Language Understanding: A Preliminary Study on Visual QA. In Proceedings of the Third Workshop on Multimodal Artificial Intelligence, pages 74–78, Mexico City, Mexico. Association for Computational Linguistics.