@inproceedings{shahmohammadi-etal-2021-learning,
title = "Learning Zero-Shot Multifaceted Visually Grounded Word Embeddings via Multi-Task Training",
author = "Shahmohammadi, Hassan and
Lensch, Hendrik P. A. and
Baayen, R. Harald",
editor = "Bisazza, Arianna and
Abend, Omri",
booktitle = "Proceedings of the 25th Conference on Computational Natural Language Learning",
month = nov,
year = "2021",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.conll-1.12/",
doi = "10.18653/v1/2021.conll-1.12",
pages = "158--170",
abstract = "Language grounding aims at linking the symbolic representation of language (e.g., words) into the rich perceptual knowledge of the outside world. The general approach is to embed both textual and visual information into a common space -the grounded space- confined by an explicit relationship. We argue that since concrete and abstract words are processed differently in the brain, such approaches sacrifice the abstract knowledge obtained from textual statistics in the process of acquiring perceptual information. The focus of this paper is to solve this issue by implicitly grounding the word embeddings. Rather than learning two mappings into a joint space, our approach integrates modalities by implicit alignment. This is achieved by learning a reversible mapping between the textual and the grounded space by means of multi-task training. Intrinsic and extrinsic evaluations show that our way of visual grounding is highly beneficial for both abstract and concrete words. Our embeddings are correlated with human judgments and outperform previous works using pretrained word embeddings on a wide range of benchmarks. Our grounded embeddings are publicly available here."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="shahmohammadi-etal-2021-learning">
<titleInfo>
<title>Learning Zero-Shot Multifaceted Visually Grounded Word Embeddings via Multi-Task Training</title>
</titleInfo>
<name type="personal">
<namePart type="given">Hassan</namePart>
<namePart type="family">Shahmohammadi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hendrik</namePart>
<namePart type="given">P</namePart>
<namePart type="given">A</namePart>
<namePart type="family">Lensch</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">R</namePart>
<namePart type="given">Harald</namePart>
<namePart type="family">Baayen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 25th Conference on Computational Natural Language Learning</title>
</titleInfo>
<name type="personal">
<namePart type="given">Arianna</namePart>
<namePart type="family">Bisazza</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Omri</namePart>
<namePart type="family">Abend</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Language grounding aims at linking the symbolic representation of language (e.g., words) into the rich perceptual knowledge of the outside world. The general approach is to embed both textual and visual information into a common space -the grounded space- confined by an explicit relationship. We argue that since concrete and abstract words are processed differently in the brain, such approaches sacrifice the abstract knowledge obtained from textual statistics in the process of acquiring perceptual information. The focus of this paper is to solve this issue by implicitly grounding the word embeddings. Rather than learning two mappings into a joint space, our approach integrates modalities by implicit alignment. This is achieved by learning a reversible mapping between the textual and the grounded space by means of multi-task training. Intrinsic and extrinsic evaluations show that our way of visual grounding is highly beneficial for both abstract and concrete words. Our embeddings are correlated with human judgments and outperform previous works using pretrained word embeddings on a wide range of benchmarks. Our grounded embeddings are publicly available here.</abstract>
<identifier type="citekey">shahmohammadi-etal-2021-learning</identifier>
<identifier type="doi">10.18653/v1/2021.conll-1.12</identifier>
<location>
<url>https://aclanthology.org/2021.conll-1.12/</url>
</location>
<part>
<date>2021-11</date>
<extent unit="page">
<start>158</start>
<end>170</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Learning Zero-Shot Multifaceted Visually Grounded Word Embeddings via Multi-Task Training
%A Shahmohammadi, Hassan
%A Lensch, Hendrik P. A.
%A Baayen, R. Harald
%Y Bisazza, Arianna
%Y Abend, Omri
%S Proceedings of the 25th Conference on Computational Natural Language Learning
%D 2021
%8 November
%I Association for Computational Linguistics
%C Online
%F shahmohammadi-etal-2021-learning
%X Language grounding aims at linking the symbolic representation of language (e.g., words) into the rich perceptual knowledge of the outside world. The general approach is to embed both textual and visual information into a common space -the grounded space- confined by an explicit relationship. We argue that since concrete and abstract words are processed differently in the brain, such approaches sacrifice the abstract knowledge obtained from textual statistics in the process of acquiring perceptual information. The focus of this paper is to solve this issue by implicitly grounding the word embeddings. Rather than learning two mappings into a joint space, our approach integrates modalities by implicit alignment. This is achieved by learning a reversible mapping between the textual and the grounded space by means of multi-task training. Intrinsic and extrinsic evaluations show that our way of visual grounding is highly beneficial for both abstract and concrete words. Our embeddings are correlated with human judgments and outperform previous works using pretrained word embeddings on a wide range of benchmarks. Our grounded embeddings are publicly available here.
%R 10.18653/v1/2021.conll-1.12
%U https://aclanthology.org/2021.conll-1.12/
%U https://doi.org/10.18653/v1/2021.conll-1.12
%P 158-170
Markdown (Informal)
[Learning Zero-Shot Multifaceted Visually Grounded Word Embeddings via Multi-Task Training](https://aclanthology.org/2021.conll-1.12/) (Shahmohammadi et al., CoNLL 2021)
ACL