@inproceedings{xing-etal-2021-km,
    title = "{KM}-{BART}: Knowledge Enhanced Multimodal {BART} for Visual Commonsense Generation",
    author = "Xing, Yiran and
      Shi, Zai and
      Meng, Zhao and
      Lakemeyer, Gerhard and
      Ma, Yunpu and
      Wattenhofer, Roger",
    editor = "Zong, Chengqing and
      Xia, Fei and
      Li, Wenjie and
      Navigli, Roberto",
    booktitle = "Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 1: Long Papers)",
    month = aug,
    year = "2021",
    address = "Online",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2021.acl-long.44",
    doi = "10.18653/v1/2021.acl-long.44",
    pages = "525--535",
    abstract = "We present Knowledge Enhanced Multimodal BART (KM-BART), which is a Transformer-based sequence-to-sequence model capable of reasoning about commonsense knowledge from multimodal inputs of images and texts. We adapt the generative BART architecture (Lewis et al., 2020) to a multimodal model with visual and textual inputs. We further develop novel pretraining tasks to improve the model performance on the Visual Commonsense Generation (VCG) task. In particular, our pretraining task of Knowledge-based Commonsense Generation (KCG) boosts model performance on the VCG task by leveraging commonsense knowledge from a large language model pretrained on external commonsense knowledge graphs. To the best of our knowledge, we are the first to propose a dedicated task for improving model performance on the VCG task. Experimental results show that our model reaches state-of-the-art performance on the VCG task (Park et al., 2020) by applying these novel pretraining tasks.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="xing-etal-2021-km">
    <titleInfo>
      <title>KM-BART: Knowledge Enhanced Multimodal BART for Visual Commonsense Generation</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Yiran</namePart>
      <namePart type="family">Xing</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Zai</namePart>
      <namePart type="family">Shi</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Zhao</namePart>
      <namePart type="family">Meng</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Gerhard</namePart>
      <namePart type="family">Lakemeyer</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Yunpu</namePart>
      <namePart type="family">Ma</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Roger</namePart>
      <namePart type="family">Wattenhofer</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2021-08</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 1: Long Papers)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Chengqing</namePart>
        <namePart type="family">Zong</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Fei</namePart>
        <namePart type="family">Xia</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Wenjie</namePart>
        <namePart type="family">Li</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Roberto</namePart>
        <namePart type="family">Navigli</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Online</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>We present Knowledge Enhanced Multimodal BART (KM-BART), which is a Transformer-based sequence-to-sequence model capable of reasoning about commonsense knowledge from multimodal inputs of images and texts. We adapt the generative BART architecture (Lewis et al., 2020) to a multimodal model with visual and textual inputs. We further develop novel pretraining tasks to improve the model performance on the Visual Commonsense Generation (VCG) task. In particular, our pretraining task of Knowledge-based Commonsense Generation (KCG) boosts model performance on the VCG task by leveraging commonsense knowledge from a large language model pretrained on external commonsense knowledge graphs. To the best of our knowledge, we are the first to propose a dedicated task for improving model performance on the VCG task. Experimental results show that our model reaches state-of-the-art performance on the VCG task (Park et al., 2020) by applying these novel pretraining tasks.</abstract>
    <identifier type="citekey">xing-etal-2021-km</identifier>
    <identifier type="doi">10.18653/v1/2021.acl-long.44</identifier>
    <location>
      <url>https://aclanthology.org/2021.acl-long.44</url>
    </location>
    <part>
      <date>2021-08</date>
      <extent unit="page">
        <start>525</start>
        <end>535</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T KM-BART: Knowledge Enhanced Multimodal BART for Visual Commonsense Generation
%A Xing, Yiran
%A Shi, Zai
%A Meng, Zhao
%A Lakemeyer, Gerhard
%A Ma, Yunpu
%A Wattenhofer, Roger
%Y Zong, Chengqing
%Y Xia, Fei
%Y Li, Wenjie
%Y Navigli, Roberto
%S Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 1: Long Papers)
%D 2021
%8 August
%I Association for Computational Linguistics
%C Online
%F xing-etal-2021-km
%X We present Knowledge Enhanced Multimodal BART (KM-BART), which is a Transformer-based sequence-to-sequence model capable of reasoning about commonsense knowledge from multimodal inputs of images and texts. We adapt the generative BART architecture (Lewis et al., 2020) to a multimodal model with visual and textual inputs. We further develop novel pretraining tasks to improve the model performance on the Visual Commonsense Generation (VCG) task. In particular, our pretraining task of Knowledge-based Commonsense Generation (KCG) boosts model performance on the VCG task by leveraging commonsense knowledge from a large language model pretrained on external commonsense knowledge graphs. To the best of our knowledge, we are the first to propose a dedicated task for improving model performance on the VCG task. Experimental results show that our model reaches state-of-the-art performance on the VCG task (Park et al., 2020) by applying these novel pretraining tasks.
%R 10.18653/v1/2021.acl-long.44
%U https://aclanthology.org/2021.acl-long.44
%U https://doi.org/10.18653/v1/2021.acl-long.44
%P 525-535
Markdown (Informal)
[KM-BART: Knowledge Enhanced Multimodal BART for Visual Commonsense Generation](https://aclanthology.org/2021.acl-long.44) (Xing et al., ACL-IJCNLP 2021)
ACL
Yiran Xing, Zai Shi, Zhao Meng, Gerhard Lakemeyer, Yunpu Ma, and Roger Wattenhofer. 2021. KM-BART: Knowledge Enhanced Multimodal BART for Visual Commonsense Generation. In Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 1: Long Papers), pages 525–535, Online. Association for Computational Linguistics.
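For readers who want to experiment with the generative backbone described in the abstract, the sketch below shows a plain text-only BART model producing a commonsense inference from an event description, using the Hugging Face transformers library. This is an illustrative approximation only, not the authors' KM-BART implementation: the checkpoint name, prompt format, and generation settings are assumptions, and the sketch omits KM-BART's visual inputs and its knowledge-based (KCG) pretraining entirely.

```python
# Minimal sketch, NOT the authors' KM-BART implementation: it only illustrates
# conditioning a generative BART model on a textual event description, as a
# rough analogue of the Visual Commonsense Generation setup. Visual features
# and the KCG pretraining described in the paper are omitted.
from transformers import BartForConditionalGeneration, BartTokenizer

checkpoint = "facebook/bart-base"  # stand-in checkpoint, not KM-BART weights
tokenizer = BartTokenizer.from_pretrained(checkpoint)
model = BartForConditionalGeneration.from_pretrained(checkpoint)

# Hypothetical prompt loosely following the VCG inference types
# (before / after / intent); the exact format is an assumption.
prompt = "Event: a person shields a child with an umbrella in the rain. Intent:"
inputs = tokenizer(prompt, return_tensors="pt")

# Beam-search decoding of a candidate commonsense inference.
output_ids = model.generate(**inputs, num_beams=4, max_length=32)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
```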