@inproceedings{su-etal-2023-pandagpt,
title = "{P}anda{GPT}: One Model To Instruction-Follow Them All",
author = "Su, Yixuan and
Lan, Tian and
Li, Huayang and
Xu, Jialu and
Wang, Yan and
Cai, Deng",
editor = "Hazarika, Devamanyu and
Tang, Xiangru Robert and
Jin, Di",
booktitle = "Proceedings of the 1st Workshop on Taming Large Language Models: Controllability in the era of Interactive Assistants!",
month = sep,
year = "2023",
address = "Prague, Czech Republic",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.tllm-1.2",
pages = "11--23",
abstract = "We present PandaGPT, an approach to emPower large lANguage moDels with visual and Auditory instruction-following capabilities. Our pilot experiments show that PandaGPT can perform complex tasks such as detailed image description generation, writing stories inspired by videos, and answering questions about audios. More interestingly, PandaGPT can take multimodal inputs simultaneously and compose their semantics naturally. For example, PandaGPT can connect how objects look in an image/video and how they sound in an audio. To do so, PandaGPT combines the multimodal encoders from ImageBind and the large language models from Vicuna. Notably, only aligned image-text pairs are required for the training of PandaGPT. Thanks to the strong capability of ImageBind in embedding data from different modalities into the same space, PandaGPT displays emergent, i.e. zero-shot, cross-modal behaviors for data other than image and text (e.g., video, audio, depth, thermal, and IMU). We hope that PandaGPT serves as an initial step toward building AGI that can perceive and understand inputs in different modalities holistically, as we humans do.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="su-etal-2023-pandagpt">
<titleInfo>
<title>PandaGPT: One Model To Instruction-Follow Them All</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yixuan</namePart>
<namePart type="family">Su</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tian</namePart>
<namePart type="family">Lan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Huayang</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jialu</namePart>
<namePart type="family">Xu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yan</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Deng</namePart>
<namePart type="family">Cai</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-09</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 1st Workshop on Taming Large Language Models: Controllability in the era of Interactive Assistants!</title>
</titleInfo>
<name type="personal">
<namePart type="given">Devamanyu</namePart>
<namePart type="family">Hazarika</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xiangru</namePart>
<namePart type="given">Robert</namePart>
<namePart type="family">Tang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Di</namePart>
<namePart type="family">Jin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Prague, Czech Republic</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We present PandaGPT, an approach to emPower large lANguage moDels with visual and Auditory instruction-following capabilities. Our pilot experiments show that PandaGPT can perform complex tasks such as detailed image description generation, writing stories inspired by videos, and answering questions about audios. More interestingly, PandaGPT can take multimodal inputs simultaneously and compose their semantics naturally. For example, PandaGPT can connect how objects look in an image/video and how they sound in an audio. To do so, PandaGPT combines the multimodal encoders from ImageBind and the large language models from Vicuna. Notably, only aligned image-text pairs are required for the training of PandaGPT. Thanks to the strong capability of ImageBind in embedding data from different modalities into the same space, PandaGPT displays emergent, i.e. zero-shot, cross-modal behaviors for data other than image and text (e.g., video, audio, depth, thermal, and IMU). We hope that PandaGPT serves as an initial step toward building AGI that can perceive and understand inputs in different modalities holistically, as we humans do.</abstract>
<identifier type="citekey">su-etal-2023-pandagpt</identifier>
<location>
<url>https://aclanthology.org/2023.tllm-1.2</url>
</location>
<part>
<date>2023-09</date>
<extent unit="page">
<start>11</start>
<end>23</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T PandaGPT: One Model To Instruction-Follow Them All
%A Su, Yixuan
%A Lan, Tian
%A Li, Huayang
%A Xu, Jialu
%A Wang, Yan
%A Cai, Deng
%Y Hazarika, Devamanyu
%Y Tang, Xiangru Robert
%Y Jin, Di
%S Proceedings of the 1st Workshop on Taming Large Language Models: Controllability in the era of Interactive Assistants!
%D 2023
%8 September
%I Association for Computational Linguistics
%C Prague, Czech Republic
%F su-etal-2023-pandagpt
%X We present PandaGPT, an approach to emPower large lANguage moDels with visual and Auditory instruction-following capabilities. Our pilot experiments show that PandaGPT can perform complex tasks such as detailed image description generation, writing stories inspired by videos, and answering questions about audios. More interestingly, PandaGPT can take multimodal inputs simultaneously and compose their semantics naturally. For example, PandaGPT can connect how objects look in an image/video and how they sound in an audio. To do so, PandaGPT combines the multimodal encoders from ImageBind and the large language models from Vicuna. Notably, only aligned image-text pairs are required for the training of PandaGPT. Thanks to the strong capability of ImageBind in embedding data from different modalities into the same space, PandaGPT displays emergent, i.e. zero-shot, cross-modal behaviors for data other than image and text (e.g., video, audio, depth, thermal, and IMU). We hope that PandaGPT serves as an initial step toward building AGI that can perceive and understand inputs in different modalities holistically, as we humans do.
%U https://aclanthology.org/2023.tllm-1.2
%P 11-23
Markdown (Informal)
[PandaGPT: One Model To Instruction-Follow Them All](https://aclanthology.org/2023.tllm-1.2) (Su et al., TLLM-WS 2023)
ACL
Yixuan Su, Tian Lan, Huayang Li, Jialu Xu, Yan Wang, and Deng Cai. 2023. PandaGPT: One Model To Instruction-Follow Them All. In Proceedings of the 1st Workshop on Taming Large Language Models: Controllability in the era of Interactive Assistants!, pages 11–23, Prague, Czech Republic. Association for Computational Linguistics.