@inproceedings{gur-etal-2021-cross-modal,
    title = "Cross-Modal Retrieval Augmentation for Multi-Modal Classification",
    author = "Gur, Shir and
      Neverova, Natalia and
      Stauffer, Chris and
      Lim, Ser-Nam and
      Kiela, Douwe and
      Reiter, Austin",
    editor = "Moens, Marie-Francine and
      Huang, Xuanjing and
      Specia, Lucia and
      Yih, Scott Wen-tau",
    booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2021",
    month = nov,
    year = "2021",
    address = "Punta Cana, Dominican Republic",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2021.findings-emnlp.11",
    doi = "10.18653/v1/2021.findings-emnlp.11",
    pages = "111--123",
    abstract = "Recent advances in using retrieval components over external knowledge sources have shown impressive results for a variety of downstream tasks in natural language processing. Here, we explore the use of unstructured external knowledge sources of images and their corresponding captions for improving visual question answering (VQA). First, we train a novel alignment model for embedding images and captions in the same space, which achieves substantial improvement in performance on image-caption retrieval w.r.t. similar methods. Second, we show that retrieval-augmented multi-modal transformers using the trained alignment model improve results on VQA over strong baselines. We further conduct extensive experiments to establish the promise of this approach, and examine novel applications for inference time such as hot-swapping indices.",
}

<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="gur-etal-2021-cross-modal">
    <titleInfo>
      <title>Cross-Modal Retrieval Augmentation for Multi-Modal Classification</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Shir</namePart>
      <namePart type="family">Gur</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Natalia</namePart>
      <namePart type="family">Neverova</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Chris</namePart>
      <namePart type="family">Stauffer</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Ser-Nam</namePart>
      <namePart type="family">Lim</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Douwe</namePart>
      <namePart type="family">Kiela</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Austin</namePart>
      <namePart type="family">Reiter</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2021-11</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: EMNLP 2021</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Marie-Francine</namePart>
        <namePart type="family">Moens</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Xuanjing</namePart>
        <namePart type="family">Huang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Lucia</namePart>
        <namePart type="family">Specia</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Scott</namePart>
        <namePart type="given">Wen-tau</namePart>
        <namePart type="family">Yih</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Punta Cana, Dominican Republic</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Recent advances in using retrieval components over external knowledge sources have shown impressive results for a variety of downstream tasks in natural language processing. Here, we explore the use of unstructured external knowledge sources of images and their corresponding captions for improving visual question answering (VQA). First, we train a novel alignment model for embedding images and captions in the same space, which achieves substantial improvement in performance on image-caption retrieval w.r.t. similar methods. Second, we show that retrieval-augmented multi-modal transformers using the trained alignment model improve results on VQA over strong baselines. We further conduct extensive experiments to establish the promise of this approach, and examine novel applications for inference time such as hot-swapping indices.</abstract>
    <identifier type="citekey">gur-etal-2021-cross-modal</identifier>
    <identifier type="doi">10.18653/v1/2021.findings-emnlp.11</identifier>
    <location>
      <url>https://aclanthology.org/2021.findings-emnlp.11</url>
    </location>
    <part>
      <date>2021-11</date>
      <extent unit="page">
        <start>111</start>
        <end>123</end>
      </extent>
    </part>
  </mods>
</modsCollection>

%0 Conference Proceedings
%T Cross-Modal Retrieval Augmentation for Multi-Modal Classification
%A Gur, Shir
%A Neverova, Natalia
%A Stauffer, Chris
%A Lim, Ser-Nam
%A Kiela, Douwe
%A Reiter, Austin
%Y Moens, Marie-Francine
%Y Huang, Xuanjing
%Y Specia, Lucia
%Y Yih, Scott Wen-tau
%S Findings of the Association for Computational Linguistics: EMNLP 2021
%D 2021
%8 November
%I Association for Computational Linguistics
%C Punta Cana, Dominican Republic
%F gur-etal-2021-cross-modal
%X Recent advances in using retrieval components over external knowledge sources have shown impressive results for a variety of downstream tasks in natural language processing. Here, we explore the use of unstructured external knowledge sources of images and their corresponding captions for improving visual question answering (VQA). First, we train a novel alignment model for embedding images and captions in the same space, which achieves substantial improvement in performance on image-caption retrieval w.r.t. similar methods. Second, we show that retrieval-augmented multi-modal transformers using the trained alignment model improve results on VQA over strong baselines. We further conduct extensive experiments to establish the promise of this approach, and examine novel applications for inference time such as hot-swapping indices.
%R 10.18653/v1/2021.findings-emnlp.11
%U https://aclanthology.org/2021.findings-emnlp.11
%U https://doi.org/10.18653/v1/2021.findings-emnlp.11
%P 111-123

Markdown (Informal)
[Cross-Modal Retrieval Augmentation for Multi-Modal Classification](https://aclanthology.org/2021.findings-emnlp.11) (Gur et al., Findings 2021)

ACL
Shir Gur, Natalia Neverova, Chris Stauffer, Ser-Nam Lim, Douwe Kiela, and Austin Reiter. 2021. Cross-Modal Retrieval Augmentation for Multi-Modal Classification. In Findings of the Association for Computational Linguistics: EMNLP 2021, pages 111–123, Punta Cana, Dominican Republic. Association for Computational Linguistics.