@inproceedings{takmaz-etal-2020-generating,
    title = "{G}enerating {I}mage {D}escriptions via {S}equential {C}ross-{M}odal {A}lignment {G}uided by {H}uman {G}aze",
    author = "Takmaz, Ece and
      Pezzelle, Sandro and
      Beinborn, Lisa and
      Fern{\'a}ndez, Raquel",
    editor = "Webber, Bonnie and
      Cohn, Trevor and
      He, Yulan and
      Liu, Yang",
    booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)",
    month = nov,
    year = "2020",
    address = "Online",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2020.emnlp-main.377",
    doi = "10.18653/v1/2020.emnlp-main.377",
    pages = "4664--4677",
    abstract = "When speakers describe an image, they tend to look at objects before mentioning them. In this paper, we investigate such sequential cross-modal alignment by modelling the image description generation process computationally. We take as our starting point a state-of-the-art image captioning system and develop several model variants that exploit information from human gaze patterns recorded during language production. In particular, we propose the first approach to image description generation where visual processing is modelled sequentially. Our experiments and analyses confirm that better descriptions can be obtained by exploiting gaze-driven attention and shed light on human cognitive processes by comparing different ways of aligning the gaze modality with language production. We find that processing gaze data sequentially leads to descriptions that are better aligned to those produced by speakers, more diverse, and more natural{---}particularly when gaze is encoded with a dedicated recurrent component.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="takmaz-etal-2020-generating">
    <titleInfo>
      <title>Generating Image Descriptions via Sequential Cross-Modal Alignment Guided by Human Gaze</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Ece</namePart>
      <namePart type="family">Takmaz</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Sandro</namePart>
      <namePart type="family">Pezzelle</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Lisa</namePart>
      <namePart type="family">Beinborn</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Raquel</namePart>
      <namePart type="family">Fernández</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2020-11</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Bonnie</namePart>
        <namePart type="family">Webber</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Trevor</namePart>
        <namePart type="family">Cohn</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Yulan</namePart>
        <namePart type="family">He</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Yang</namePart>
        <namePart type="family">Liu</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Online</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>When speakers describe an image, they tend to look at objects before mentioning them. In this paper, we investigate such sequential cross-modal alignment by modelling the image description generation process computationally. We take as our starting point a state-of-the-art image captioning system and develop several model variants that exploit information from human gaze patterns recorded during language production. In particular, we propose the first approach to image description generation where visual processing is modelled sequentially. Our experiments and analyses confirm that better descriptions can be obtained by exploiting gaze-driven attention and shed light on human cognitive processes by comparing different ways of aligning the gaze modality with language production. We find that processing gaze data sequentially leads to descriptions that are better aligned to those produced by speakers, more diverse, and more natural—particularly when gaze is encoded with a dedicated recurrent component.</abstract>
    <identifier type="citekey">takmaz-etal-2020-generating</identifier>
    <identifier type="doi">10.18653/v1/2020.emnlp-main.377</identifier>
    <location>
      <url>https://aclanthology.org/2020.emnlp-main.377</url>
    </location>
    <part>
      <date>2020-11</date>
      <extent unit="page">
        <start>4664</start>
        <end>4677</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Generating Image Descriptions via Sequential Cross-Modal Alignment Guided by Human Gaze
%A Takmaz, Ece
%A Pezzelle, Sandro
%A Beinborn, Lisa
%A Fernández, Raquel
%Y Webber, Bonnie
%Y Cohn, Trevor
%Y He, Yulan
%Y Liu, Yang
%S Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)
%D 2020
%8 November
%I Association for Computational Linguistics
%C Online
%F takmaz-etal-2020-generating
%X When speakers describe an image, they tend to look at objects before mentioning them. In this paper, we investigate such sequential cross-modal alignment by modelling the image description generation process computationally. We take as our starting point a state-of-the-art image captioning system and develop several model variants that exploit information from human gaze patterns recorded during language production. In particular, we propose the first approach to image description generation where visual processing is modelled sequentially. Our experiments and analyses confirm that better descriptions can be obtained by exploiting gaze-driven attention and shed light on human cognitive processes by comparing different ways of aligning the gaze modality with language production. We find that processing gaze data sequentially leads to descriptions that are better aligned to those produced by speakers, more diverse, and more natural—particularly when gaze is encoded with a dedicated recurrent component.
%R 10.18653/v1/2020.emnlp-main.377
%U https://aclanthology.org/2020.emnlp-main.377
%U https://doi.org/10.18653/v1/2020.emnlp-main.377
%P 4664-4677
Markdown (Informal)
[Generating Image Descriptions via Sequential Cross-Modal Alignment Guided by Human Gaze](https://aclanthology.org/2020.emnlp-main.377) (Takmaz et al., EMNLP 2020)
ACL
Ece Takmaz, Sandro Pezzelle, Lisa Beinborn, and Raquel Fernández. 2020. Generating Image Descriptions via Sequential Cross-Modal Alignment Guided by Human Gaze. In Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP), pages 4664–4677, Online. Association for Computational Linguistics.