@inproceedings{varuna-krishna-etal-2023-imaginator,
title = "{IMAGINATOR}: Pre-Trained {I}mage+{T}ext Joint Embeddings using Word-Level Grounding of Images",
author = "Kolla, Varuna Krishna and
Suresh, Suryavardan and
Mishra, Shreyash and
Ramamoorthy, Sathyanarayanan and
Patwa, Parth and
Chakraborty, Megha and
Chadha, Aman and
Das, Amitava and
Sheth, Amit",
editor = "D. Pawar, Jyoti and
Lalitha Devi, Sobha",
booktitle = "Proceedings of the 20th International Conference on Natural Language Processing (ICON)",
month = dec,
year = "2023",
address = "Goa University, Goa, India",
publisher = "NLP Association of India (NLPAI)",
url = "https://aclanthology.org/2023.icon-1.1",
pages = "1--18",
abstract = "Word embeddings, i.e., semantically meaningful vector representation of words, are largely influenced by the distributional hypothesis You shall know a word by the company it keeps (Harris, 1954), whereas modern prediction- based neural network embeddings rely on de- sign choices and hyperparameter optimization. Word embeddings like Word2Vec, GloVe etc. well capture the contextuality and real-world analogies but contemporary convolution-based image embeddings such as VGGNet, AlexNet, etc. do not capture contextual knowledge. The popular king-queen analogy does not hold true for most commonly used vision embeddings. In this paper, we introduce a pre-trained joint embedding (JE), named IMAGINATOR, trained on 21K distinct image objects. JE is a way to encode multimodal data into a vec- tor space where the text modality serves as the grounding key, which the complementary modality (in this case, the image) is anchored with. IMAGINATOR encapsulates three in- dividual representations: (i) object-object co- location, (ii) word-object co-location, and (iii) word-object correlation. These three ways cap- ture complementary aspects of the two modal- ities which are further combined to obtain the final object-word JEs. Generated JEs are intrinsically evaluated to assess how well they capture the contextual- ity and real-world analogies. We also evalu- ate pre-trained IMAGINATOR JEs on three downstream tasks: (i) image captioning, (ii) Im- age2Tweet, and (iii) text-based image retrieval. IMAGINATOR establishes a new standard on the aforementioned downstream tasks by out- performing the current SoTA on all the selected tasks. The code is available at https:// github.com/varunakk/IMAGINATOR.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="varuna-krishna-etal-2023-imaginator">
<titleInfo>
<title>IMAGINATOR: Pre-Trained Image+Text Joint Embeddings using Word-Level Grounding of Images</title>
</titleInfo>
<name type="personal">
<namePart type="given">Varuna</namePart>
<namePart type="given">Krishna</namePart>
<namePart type="family">Kolla</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Suryavardan</namePart>
<namePart type="family">Suresh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shreyash</namePart>
<namePart type="family">Mishra</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sathyanarayanan</namePart>
<namePart type="family">Ramamoorthy</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Parth</namePart>
<namePart type="family">Patwa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Megha</namePart>
<namePart type="family">Chakraborty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aman</namePart>
<namePart type="family">Chadha</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Amitava</namePart>
<namePart type="family">Das</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Amit</namePart>
<namePart type="family">Sheth</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 20th International Conference on Natural Language Processing (ICON)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jyoti</namePart>
<namePart type="family">D. Pawar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sobha</namePart>
<namePart type="family">Lalitha Devi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>NLP Association of India (NLPAI)</publisher>
<place>
<placeTerm type="text">Goa University, Goa, India</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Word embeddings, i.e., semantically meaningful vector representation of words, are largely influenced by the distributional hypothesis "You shall know a word by the company it keeps" (Harris, 1954), whereas modern prediction-based neural network embeddings rely on design choices and hyperparameter optimization. Word embeddings like Word2Vec, GloVe etc. well capture the contextuality and real-world analogies but contemporary convolution-based image embeddings such as VGGNet, AlexNet, etc. do not capture contextual knowledge. The popular king-queen analogy does not hold true for most commonly used vision embeddings. In this paper, we introduce a pre-trained joint embedding (JE), named IMAGINATOR, trained on 21K distinct image objects. JE is a way to encode multimodal data into a vector space where the text modality serves as the grounding key, which the complementary modality (in this case, the image) is anchored with. IMAGINATOR encapsulates three individual representations: (i) object-object co-location, (ii) word-object co-location, and (iii) word-object correlation. These three ways capture complementary aspects of the two modalities which are further combined to obtain the final object-word JEs. Generated JEs are intrinsically evaluated to assess how well they capture the contextuality and real-world analogies. We also evaluate pre-trained IMAGINATOR JEs on three downstream tasks: (i) image captioning, (ii) Image2Tweet, and (iii) text-based image retrieval. IMAGINATOR establishes a new standard on the aforementioned downstream tasks by outperforming the current SoTA on all the selected tasks. The code is available at https://github.com/varunakk/IMAGINATOR.</abstract>
<identifier type="citekey">varuna-krishna-etal-2023-imaginator</identifier>
<location>
<url>https://aclanthology.org/2023.icon-1.1</url>
</location>
<part>
<date>2023-12</date>
<extent unit="page">
<start>1</start>
<end>18</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T IMAGINATOR: Pre-Trained Image+Text Joint Embeddings using Word-Level Grounding of Images
%A Kolla, Varuna Krishna
%A Suresh, Suryavardan
%A Mishra, Shreyash
%A Ramamoorthy, Sathyanarayanan
%A Patwa, Parth
%A Chakraborty, Megha
%A Chadha, Aman
%A Das, Amitava
%A Sheth, Amit
%Y D. Pawar, Jyoti
%Y Lalitha Devi, Sobha
%S Proceedings of the 20th International Conference on Natural Language Processing (ICON)
%D 2023
%8 December
%I NLP Association of India (NLPAI)
%C Goa University, Goa, India
%F varuna-krishna-etal-2023-imaginator
%X Word embeddings, i.e., semantically meaningful vector representation of words, are largely influenced by the distributional hypothesis "You shall know a word by the company it keeps" (Harris, 1954), whereas modern prediction-based neural network embeddings rely on design choices and hyperparameter optimization. Word embeddings like Word2Vec, GloVe etc. well capture the contextuality and real-world analogies but contemporary convolution-based image embeddings such as VGGNet, AlexNet, etc. do not capture contextual knowledge. The popular king-queen analogy does not hold true for most commonly used vision embeddings. In this paper, we introduce a pre-trained joint embedding (JE), named IMAGINATOR, trained on 21K distinct image objects. JE is a way to encode multimodal data into a vector space where the text modality serves as the grounding key, which the complementary modality (in this case, the image) is anchored with. IMAGINATOR encapsulates three individual representations: (i) object-object co-location, (ii) word-object co-location, and (iii) word-object correlation. These three ways capture complementary aspects of the two modalities which are further combined to obtain the final object-word JEs. Generated JEs are intrinsically evaluated to assess how well they capture the contextuality and real-world analogies. We also evaluate pre-trained IMAGINATOR JEs on three downstream tasks: (i) image captioning, (ii) Image2Tweet, and (iii) text-based image retrieval. IMAGINATOR establishes a new standard on the aforementioned downstream tasks by outperforming the current SoTA on all the selected tasks. The code is available at https://github.com/varunakk/IMAGINATOR.
%U https://aclanthology.org/2023.icon-1.1
%P 1-18
Markdown (Informal)
[IMAGINATOR: Pre-Trained Image+Text Joint Embeddings using Word-Level Grounding of Images](https://aclanthology.org/2023.icon-1.1) (Kolla et al., ICON 2023)
ACL
- Varuna Krishna Kolla, Suryavardan Suresh, Shreyash Mishra, Sathyanarayanan Ramamoorthy, Parth Patwa, Megha Chakraborty, Aman Chadha, Amitava Das, and Amit Sheth. 2023. IMAGINATOR: Pre-Trained Image+Text Joint Embeddings using Word-Level Grounding of Images. In Proceedings of the 20th International Conference on Natural Language Processing (ICON), pages 1–18, Goa University, Goa, India. NLP Association of India (NLPAI).