@inproceedings{samaran-etal-2021-attending,
title = "Attending Self-Attention: A Case Study of Visually Grounded Supervision in Vision-and-Language Transformers",
author = "Samaran, Jules and
Garcia, Noa and
Otani, Mayu and
Chu, Chenhui and
Nakashima, Yuta",
editor = "Kabbara, Jad and
Lin, Haitao and
Paullada, Amandalynne and
Vamvas, Jannis",
booktitle = "Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing: Student Research Workshop",
month = aug,
year = "2021",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.acl-srw.8",
doi = "10.18653/v1/2021.acl-srw.8",
pages = "81--86",
abstract = "The impressive performance of pre-trained visually grounded language models has motivated a growing body of research investigating what is learned during pre-training. Since many of these models are based on Transformers, several studies have examined the attention mechanisms these models use to associate phrases with their visual grounding in the image. In this work, we investigate how directly supervising attention to learn visual grounding affects the behavior of such models. We compare three different methods of attention supervision and their impact on the performance of a state-of-the-art visually grounded language model on two popular vision-and-language tasks.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="samaran-etal-2021-attending">
    <titleInfo>
      <title>Attending Self-Attention: A Case Study of Visually Grounded Supervision in Vision-and-Language Transformers</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Jules</namePart>
      <namePart type="family">Samaran</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Noa</namePart>
      <namePart type="family">Garcia</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Mayu</namePart>
      <namePart type="family">Otani</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Chenhui</namePart>
      <namePart type="family">Chu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Yuta</namePart>
      <namePart type="family">Nakashima</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2021-08</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing: Student Research Workshop</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Jad</namePart>
        <namePart type="family">Kabbara</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Haitao</namePart>
        <namePart type="family">Lin</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Amandalynne</namePart>
        <namePart type="family">Paullada</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jannis</namePart>
        <namePart type="family">Vamvas</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Online</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>The impressive performance of pre-trained visually grounded language models has motivated a growing body of research investigating what is learned during pre-training. Since many of these models are based on Transformers, several studies have examined the attention mechanisms these models use to associate phrases with their visual grounding in the image. In this work, we investigate how directly supervising attention to learn visual grounding affects the behavior of such models. We compare three different methods of attention supervision and their impact on the performance of a state-of-the-art visually grounded language model on two popular vision-and-language tasks.</abstract>
    <identifier type="citekey">samaran-etal-2021-attending</identifier>
    <identifier type="doi">10.18653/v1/2021.acl-srw.8</identifier>
    <location>
      <url>https://aclanthology.org/2021.acl-srw.8</url>
    </location>
    <part>
      <date>2021-08</date>
      <extent unit="page">
        <start>81</start>
        <end>86</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Attending Self-Attention: A Case Study of Visually Grounded Supervision in Vision-and-Language Transformers
%A Samaran, Jules
%A Garcia, Noa
%A Otani, Mayu
%A Chu, Chenhui
%A Nakashima, Yuta
%Y Kabbara, Jad
%Y Lin, Haitao
%Y Paullada, Amandalynne
%Y Vamvas, Jannis
%S Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing: Student Research Workshop
%D 2021
%8 August
%I Association for Computational Linguistics
%C Online
%F samaran-etal-2021-attending
%X The impressive performance of pre-trained visually grounded language models has motivated a growing body of research investigating what is learned during pre-training. Since many of these models are based on Transformers, several studies have examined the attention mechanisms these models use to associate phrases with their visual grounding in the image. In this work, we investigate how directly supervising attention to learn visual grounding affects the behavior of such models. We compare three different methods of attention supervision and their impact on the performance of a state-of-the-art visually grounded language model on two popular vision-and-language tasks.
%R 10.18653/v1/2021.acl-srw.8
%U https://aclanthology.org/2021.acl-srw.8
%U https://doi.org/10.18653/v1/2021.acl-srw.8
%P 81-86
Markdown (Informal)
[Attending Self-Attention: A Case Study of Visually Grounded Supervision in Vision-and-Language Transformers](https://aclanthology.org/2021.acl-srw.8) (Samaran et al., ACL-IJCNLP 2021)
ACL
Jules Samaran, Noa Garcia, Mayu Otani, Chenhui Chu, and Yuta Nakashima. 2021. Attending Self-Attention: A Case Study of Visually Grounded Supervision in Vision-and-Language Transformers. In Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing: Student Research Workshop, pages 81–86, Online. Association for Computational Linguistics.
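
As an illustrative aside: the abstract refers to directly supervising a vision-and-language Transformer's attention so that it learns visual grounding. The sketch below is not taken from the paper and does not reproduce any of the three supervision methods it compares; it is only a minimal, generic example, assuming PyTorch, a softmax-normalized word-to-region attention map, and ground-truth word-region alignment annotations, of what an attention-supervision loss can look like.

```python
# Hypothetical sketch of a generic attention-supervision loss.
# The tensor shapes and the KL-divergence formulation are assumptions
# for illustration; they are not the paper's specific methods.
import torch
import torch.nn.functional as F


def attention_supervision_loss(attn_weights, alignment, eps=1e-8):
    """KL divergence between a model's word-to-region attention and a
    ground-truth word-region alignment map.

    attn_weights: (batch, num_words, num_regions), softmax-normalized
        attention over image regions for each word.
    alignment:    (batch, num_words, num_regions), binary or soft
        ground-truth grounding annotations.
    """
    # Normalize the annotations into a target distribution per word.
    target = alignment / (alignment.sum(dim=-1, keepdim=True) + eps)
    # KL(target || attention), summed over regions, averaged over words/batch.
    kl = (target * (torch.log(target + eps) - torch.log(attn_weights + eps))).sum(dim=-1)
    return kl.mean()


if __name__ == "__main__":
    # Toy usage: 2 examples, 5 words, 36 detected regions (a common choice).
    attn = F.softmax(torch.randn(2, 5, 36), dim=-1)
    gt = torch.zeros(2, 5, 36)
    gt[:, :, 0] = 1.0  # pretend every word is grounded in region 0
    print(attention_supervision_loss(attn, gt).item())
```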