@inproceedings{zhu-etal-2022-knowledge-transfer,
title = "Knowledge Transfer with Visual Prompt in multi-modal Dialogue Understanding and Generation",
author = "Zhu, Minjun and
Weng, Yixuan and
Li, Bin and
He, Shizhu and
Liu, Kang and
Zhao, Jun",
editor = "Dernoncourt, Franck and
Nguyen, Thien Huu and
Lai, Viet Dac and
Veyseh, Amir Pouran Ben and
Bui, Trung H. and
Yoon, David Seunghyun",
booktitle = "Proceedings of the First Workshop On Transcript Understanding",
month = oct,
year = "2022",
address = "Gyeongju, South Korea",
publisher = "International Conference on Computational Linguistics",
url = "https://aclanthology.org/2022.tu-1.2",
pages = "8--19",
abstract = "Visual Dialogue (VD) task has recently received increasing attention in AI research. Visual Dialog aims to generate multi-round, interactive responses based on the dialog history and image content. Existing textual dialogue models cannot fully understand visual information, resulting in a lack of scene features when communicating with humans continuously. Therefore, how to efficiently fuse multimodal data features remains to be a challenge. In this work, we propose a knowledge transfer method with visual prompt (VPTG) fusing multi-modal data, which is a flexible module that can utilize the text-only seq2seq model to handle visual dialogue tasks. The VPTG conducts text-image co-learning and multi-modal information fusion with visual prompts and visual knowledge distillation. Specifically, we construct visual prompts from visual representations and then induce sequence-to-sequence(seq2seq) models to fuse visual information and textual contexts by visual-text patterns. And we also realize visual knowledge transfer through distillation between two different models{'} text representations, so that the seq2seq model can actively learn visual semantic representations. Extensive experiments on the multi-modal dialogue understanding and generation (MDUG) datasets show the proposed VPTG outperforms other single-modal methods, which demonstrate the effectiveness of visual prompt and visual knowledge transfer.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="zhu-etal-2022-knowledge-transfer">
<titleInfo>
<title>Knowledge Transfer with Visual Prompt in multi-modal Dialogue Understanding and Generation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Minjun</namePart>
<namePart type="family">Zhu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yixuan</namePart>
<namePart type="family">Weng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bin</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shizhu</namePart>
<namePart type="family">He</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kang</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jun</namePart>
<namePart type="family">Zhao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-10</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the First Workshop On Transcript Understanding</title>
</titleInfo>
<name type="personal">
<namePart type="given">Franck</namePart>
<namePart type="family">Dernoncourt</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Thien</namePart>
<namePart type="given">Huu</namePart>
<namePart type="family">Nguyen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Viet</namePart>
<namePart type="given">Dac</namePart>
<namePart type="family">Lai</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Amir</namePart>
<namePart type="given">Pouran</namePart>
<namePart type="given">Ben</namePart>
<namePart type="family">Veyseh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Trung</namePart>
<namePart type="given">H</namePart>
<namePart type="family">Bui</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="given">Seunghyun</namePart>
<namePart type="family">Yoon</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>International Conference on Computational Linguistics</publisher>
<place>
<placeTerm type="text">Gyeongju, South Korea</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>The Visual Dialogue (VD) task has recently received increasing attention in AI research. Visual Dialogue aims to generate multi-round, interactive responses based on the dialogue history and image content. Existing textual dialogue models cannot fully understand visual information, resulting in a lack of scene features when communicating with humans continuously. Therefore, how to efficiently fuse multi-modal data features remains a challenge. In this work, we propose a knowledge transfer method with visual prompt (VPTG) for fusing multi-modal data, a flexible module that enables a text-only sequence-to-sequence (seq2seq) model to handle visual dialogue tasks. VPTG conducts text-image co-learning and multi-modal information fusion with visual prompts and visual knowledge distillation. Specifically, we construct visual prompts from visual representations and then induce seq2seq models to fuse visual information and textual contexts via visual-text patterns. We also realize visual knowledge transfer through distillation between the text representations of two different models, so that the seq2seq model can actively learn visual semantic representations. Extensive experiments on the multi-modal dialogue understanding and generation (MDUG) datasets show that the proposed VPTG outperforms other single-modal methods, demonstrating the effectiveness of the visual prompt and visual knowledge transfer.</abstract>
<identifier type="citekey">zhu-etal-2022-knowledge-transfer</identifier>
<location>
<url>https://aclanthology.org/2022.tu-1.2</url>
</location>
<part>
<date>2022-10</date>
<extent unit="page">
<start>8</start>
<end>19</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Knowledge Transfer with Visual Prompt in multi-modal Dialogue Understanding and Generation
%A Zhu, Minjun
%A Weng, Yixuan
%A Li, Bin
%A He, Shizhu
%A Liu, Kang
%A Zhao, Jun
%Y Dernoncourt, Franck
%Y Nguyen, Thien Huu
%Y Lai, Viet Dac
%Y Veyseh, Amir Pouran Ben
%Y Bui, Trung H.
%Y Yoon, David Seunghyun
%S Proceedings of the First Workshop On Transcript Understanding
%D 2022
%8 October
%I International Conference on Computational Linguistics
%C Gyeongju, South Korea
%F zhu-etal-2022-knowledge-transfer
%X The Visual Dialogue (VD) task has recently received increasing attention in AI research. Visual Dialogue aims to generate multi-round, interactive responses based on the dialogue history and image content. Existing textual dialogue models cannot fully understand visual information, resulting in a lack of scene features when communicating with humans continuously. Therefore, how to efficiently fuse multi-modal data features remains a challenge. In this work, we propose a knowledge transfer method with visual prompt (VPTG) for fusing multi-modal data, a flexible module that enables a text-only sequence-to-sequence (seq2seq) model to handle visual dialogue tasks. VPTG conducts text-image co-learning and multi-modal information fusion with visual prompts and visual knowledge distillation. Specifically, we construct visual prompts from visual representations and then induce seq2seq models to fuse visual information and textual contexts via visual-text patterns. We also realize visual knowledge transfer through distillation between the text representations of two different models, so that the seq2seq model can actively learn visual semantic representations. Extensive experiments on the multi-modal dialogue understanding and generation (MDUG) datasets show that the proposed VPTG outperforms other single-modal methods, demonstrating the effectiveness of the visual prompt and visual knowledge transfer.
%U https://aclanthology.org/2022.tu-1.2
%P 8-19
Markdown (Informal)
[Knowledge Transfer with Visual Prompt in multi-modal Dialogue Understanding and Generation](https://aclanthology.org/2022.tu-1.2) (Zhu et al., TU 2022)
ACL
Minjun Zhu, Yixuan Weng, Bin Li, Shizhu He, Kang Liu, and Jun Zhao. 2022. Knowledge Transfer with Visual Prompt in multi-modal Dialogue Understanding and Generation. In Proceedings of the First Workshop On Transcript Understanding, pages 8–19, Gyeongju, South Korea. International Conference on Computational Linguistics.