@inproceedings{ghanimifard-dobnik-2019-goes,
title = "What goes into a word: generating image descriptions with top-down spatial knowledge",
author = "Ghanimifard, Mehdi and
Dobnik, Simon",
editor = "van Deemter, Kees and
Lin, Chenghua and
Takamura, Hiroya",
booktitle = "Proceedings of the 12th International Conference on Natural Language Generation",
month = oct # "{--}" # nov,
year = "2019",
address = "Tokyo, Japan",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/W19-8668",
doi = "10.18653/v1/W19-8668",
pages = "540--551",
abstract = "Generating grounded image descriptions requires associating linguistic units with their corresponding visual clues. A common method is to train a decoder language model with attention mechanism over convolutional visual features. Attention weights align the stratified visual features arranged by their location with tokens, most commonly words, in the target description. However, words such as spatial relations (e.g. \textit{next to} and \textit{under}) are not directly referring to geometric arrangements of pixels but to complex geometric and conceptual representations. The aim of this paper is to evaluate what representations facilitate generating image descriptions with spatial relations and lead to better grounded language generation. In particular, we investigate the contribution of three different representational modalities in generating relational referring expressions: (i) pre-trained convolutional visual features, (ii) different top-down geometric relational knowledge between objects, and (iii) world knowledge captured by contextual embeddings in language models.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="ghanimifard-dobnik-2019-goes">
<titleInfo>
<title>What goes into a word: generating image descriptions with top-down spatial knowledge</title>
</titleInfo>
<name type="personal">
<namePart type="given">Mehdi</namePart>
<namePart type="family">Ghanimifard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Simon</namePart>
<namePart type="family">Dobnik</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2019-oct–nov</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 12th International Conference on Natural Language Generation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kees</namePart>
<namePart type="family">van Deemter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chenghua</namePart>
<namePart type="family">Lin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hiroya</namePart>
<namePart type="family">Takamura</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Tokyo, Japan</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Generating grounded image descriptions requires associating linguistic units with their corresponding visual clues. A common method is to train a decoder language model with attention mechanism over convolutional visual features. Attention weights align the stratified visual features arranged by their location with tokens, most commonly words, in the target description. However, words such as spatial relations (e.g. next to and under) are not directly referring to geometric arrangements of pixels but to complex geometric and conceptual representations. The aim of this paper is to evaluate what representations facilitate generating image descriptions with spatial relations and lead to better grounded language generation. In particular, we investigate the contribution of three different representational modalities in generating relational referring expressions: (i) pre-trained convolutional visual features, (ii) different top-down geometric relational knowledge between objects, and (iii) world knowledge captured by contextual embeddings in language models.</abstract>
<identifier type="citekey">ghanimifard-dobnik-2019-goes</identifier>
<identifier type="doi">10.18653/v1/W19-8668</identifier>
<location>
<url>https://aclanthology.org/W19-8668</url>
</location>
<part>
<date>2019-oct–nov</date>
<extent unit="page">
<start>540</start>
<end>551</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T What goes into a word: generating image descriptions with top-down spatial knowledge
%A Ghanimifard, Mehdi
%A Dobnik, Simon
%Y van Deemter, Kees
%Y Lin, Chenghua
%Y Takamura, Hiroya
%S Proceedings of the 12th International Conference on Natural Language Generation
%D 2019
%8 oct–nov
%I Association for Computational Linguistics
%C Tokyo, Japan
%F ghanimifard-dobnik-2019-goes
%X Generating grounded image descriptions requires associating linguistic units with their corresponding visual clues. A common method is to train a decoder language model with attention mechanism over convolutional visual features. Attention weights align the stratified visual features arranged by their location with tokens, most commonly words, in the target description. However, words such as spatial relations (e.g. next to and under) are not directly referring to geometric arrangements of pixels but to complex geometric and conceptual representations. The aim of this paper is to evaluate what representations facilitate generating image descriptions with spatial relations and lead to better grounded language generation. In particular, we investigate the contribution of three different representational modalities in generating relational referring expressions: (i) pre-trained convolutional visual features, (ii) different top-down geometric relational knowledge between objects, and (iii) world knowledge captured by contextual embeddings in language models.
%R 10.18653/v1/W19-8668
%U https://aclanthology.org/W19-8668
%U https://doi.org/10.18653/v1/W19-8668
%P 540-551
Markdown (Informal)
[What goes into a word: generating image descriptions with top-down spatial knowledge](https://aclanthology.org/W19-8668) (Ghanimifard & Dobnik, INLG 2019)
ACL
Mehdi Ghanimifard and Simon Dobnik. 2019. What goes into a word: generating image descriptions with top-down spatial knowledge. In Proceedings of the 12th International Conference on Natural Language Generation, pages 540–551, Tokyo, Japan. Association for Computational Linguistics.