@inproceedings{ilinykh-dobnik-2021-vision,
title = "How Vision Affects Language: Comparing Masked Self-Attention in Uni-Modal and Multi-Modal Transformer",
author = "Ilinykh, Nikolai and
Dobnik, Simon",
editor = "Donatelli, Lucia and
Krishnaswamy, Nikhil and
Lai, Kenneth and
Pustejovsky, James",
booktitle = "Proceedings of the 1st Workshop on Multimodal Semantic Representations (MMSR)",
month = jun,
year = "2021",
address = "Groningen, Netherlands (Online)",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.mmsr-1.5",
pages = "45--55",
abstract = "The problem of interpretation of knowledge learned by multi-head self-attention in transformers has been one of the central questions in NLP. However, a lot of work mainly focused on models trained for uni-modal tasks, e.g. machine translation. In this paper, we examine masked self-attention in a multi-modal transformer trained for the task of image captioning. In particular, we test whether the multi-modality of the task objective affects the learned attention patterns. Our visualisations of masked self-attention demonstrate that (i) it can learn general linguistic knowledge of the textual input, and (ii) its attention patterns incorporate artefacts from visual modality even though it has never accessed it directly. We compare our transformer{'}s attention patterns with masked attention in distilgpt-2 tested for uni-modal text generation of image captions. Based on the maps of extracted attention weights, we argue that masked self-attention in image captioning transformer seems to be enhanced with semantic knowledge from images, exemplifying joint language-and-vision information in its attention patterns.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="ilinykh-dobnik-2021-vision">
<titleInfo>
<title>How Vision Affects Language: Comparing Masked Self-Attention in Uni-Modal and Multi-Modal Transformer</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nikolai</namePart>
<namePart type="family">Ilinykh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Simon</namePart>
<namePart type="family">Dobnik</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 1st Workshop on Multimodal Semantic Representations (MMSR)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Lucia</namePart>
<namePart type="family">Donatelli</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nikhil</namePart>
<namePart type="family">Krishnaswamy</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kenneth</namePart>
<namePart type="family">Lai</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">James</namePart>
<namePart type="family">Pustejovsky</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Groningen, Netherlands (Online)</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>The problem of interpreting the knowledge learned by multi-head self-attention in transformers has been one of the central questions in NLP. However, much of this work has focused mainly on models trained for uni-modal tasks, e.g. machine translation. In this paper, we examine masked self-attention in a multi-modal transformer trained for the task of image captioning. In particular, we test whether the multi-modality of the task objective affects the learned attention patterns. Our visualisations of masked self-attention demonstrate that (i) it can learn general linguistic knowledge of the textual input, and (ii) its attention patterns incorporate artefacts from the visual modality even though it has never accessed it directly. We compare our transformer’s attention patterns with masked attention in distilgpt-2 tested for uni-modal text generation of image captions. Based on the maps of extracted attention weights, we argue that masked self-attention in the image captioning transformer seems to be enhanced with semantic knowledge from images, exemplifying joint language-and-vision information in its attention patterns.</abstract>
<identifier type="citekey">ilinykh-dobnik-2021-vision</identifier>
<location>
<url>https://aclanthology.org/2021.mmsr-1.5</url>
</location>
<part>
<date>2021-06</date>
<extent unit="page">
<start>45</start>
<end>55</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T How Vision Affects Language: Comparing Masked Self-Attention in Uni-Modal and Multi-Modal Transformer
%A Ilinykh, Nikolai
%A Dobnik, Simon
%Y Donatelli, Lucia
%Y Krishnaswamy, Nikhil
%Y Lai, Kenneth
%Y Pustejovsky, James
%S Proceedings of the 1st Workshop on Multimodal Semantic Representations (MMSR)
%D 2021
%8 June
%I Association for Computational Linguistics
%C Groningen, Netherlands (Online)
%F ilinykh-dobnik-2021-vision
%X The problem of interpreting the knowledge learned by multi-head self-attention in transformers has been one of the central questions in NLP. However, much of this work has focused mainly on models trained for uni-modal tasks, e.g. machine translation. In this paper, we examine masked self-attention in a multi-modal transformer trained for the task of image captioning. In particular, we test whether the multi-modality of the task objective affects the learned attention patterns. Our visualisations of masked self-attention demonstrate that (i) it can learn general linguistic knowledge of the textual input, and (ii) its attention patterns incorporate artefacts from the visual modality even though it has never accessed it directly. We compare our transformer’s attention patterns with masked attention in distilgpt-2 tested for uni-modal text generation of image captions. Based on the maps of extracted attention weights, we argue that masked self-attention in the image captioning transformer seems to be enhanced with semantic knowledge from images, exemplifying joint language-and-vision information in its attention patterns.
%U https://aclanthology.org/2021.mmsr-1.5
%P 45-55
Markdown (Informal)
[How Vision Affects Language: Comparing Masked Self-Attention in Uni-Modal and Multi-Modal Transformer](https://aclanthology.org/2021.mmsr-1.5) (Ilinykh & Dobnik, MMSR 2021)
ACL
Nikolai Ilinykh and Simon Dobnik. 2021. [How Vision Affects Language: Comparing Masked Self-Attention in Uni-Modal and Multi-Modal Transformer](https://aclanthology.org/2021.mmsr-1.5). In *Proceedings of the 1st Workshop on Multimodal Semantic Representations (MMSR)*, pages 45–55, Groningen, Netherlands (Online). Association for Computational Linguistics.
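
For readers who want to reproduce the kind of attention-weight extraction the abstract describes for the uni-modal distilgpt-2 baseline, here is a minimal sketch using the Hugging Face `transformers` library. It is an illustrative assumption, not the authors' code: the model identifier `distilgpt2` is the public Hugging Face checkpoint, and the example caption and variable names are invented for the sketch.

```python
# Minimal sketch: extract per-layer masked self-attention weights from distilgpt2
# for a single caption. This is illustrative only and does not reproduce the
# paper's multi-modal (image captioning) transformer.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
model = AutoModelForCausalLM.from_pretrained("distilgpt2")
model.eval()

caption = "a man riding a horse on the beach"  # example input, not from the paper
inputs = tokenizer(caption, return_tensors="pt")

with torch.no_grad():
    outputs = model(**inputs, output_attentions=True)

# outputs.attentions is a tuple with one tensor per layer, each of shape
# (batch, num_heads, seq_len, seq_len); the lower-triangular pattern reflects
# the causal (masked) self-attention that the paper visualises.
for layer_idx, layer_attention in enumerate(outputs.attentions):
    print(f"layer {layer_idx}: {tuple(layer_attention.shape)}")
```

The extracted tensors can then be plotted as token-by-token heat maps (one per head and layer), which is the form of attention map the abstract refers to when comparing the uni-modal and multi-modal models.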