@inproceedings{wang-etal-2019-role,
title = "On the Role of Scene Graphs in Image Captioning",
author = "Wang, Dalin and
Beck, Daniel and
Cohn, Trevor",
editor = "Mogadala, Aditya and
Klakow, Dietrich and
Pezzelle, Sandro and
Moens, Marie-Francine",
booktitle = "Proceedings of the Beyond Vision and LANguage: inTEgrating Real-world kNowledge (LANTERN)",
month = nov,
year = "2019",
address = "Hong Kong, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/D19-6405",
doi = "10.18653/v1/D19-6405",
pages = "29--34",
abstract = "Scene graphs represent semantic information in images, which can help image captioning system to produce more descriptive outputs versus using only the image as context. Recent captioning approaches rely on ad-hoc approaches to obtain graphs for images. However, those graphs introduce noise and it is unclear the effect of parser errors on captioning accuracy. In this work, we investigate to what extent scene graphs can help image captioning. Our results show that a state-of-the-art scene graph parser can boost performance almost as much as the ground truth graphs, showing that the bottleneck currently resides more on the captioning models than on the performance of the scene graph parser.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="wang-etal-2019-role">
<titleInfo>
<title>On the Role of Scene Graphs in Image Captioning</title>
</titleInfo>
<name type="personal">
<namePart type="given">Dalin</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Daniel</namePart>
<namePart type="family">Beck</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Trevor</namePart>
<namePart type="family">Cohn</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2019-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Beyond Vision and LANguage: inTEgrating Real-world kNowledge (LANTERN)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Aditya</namePart>
<namePart type="family">Mogadala</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dietrich</namePart>
<namePart type="family">Klakow</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sandro</namePart>
<namePart type="family">Pezzelle</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marie-Francine</namePart>
<namePart type="family">Moens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Hong Kong, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Scene graphs represent semantic information in images, which can help image captioning system to produce more descriptive outputs versus using only the image as context. Recent captioning approaches rely on ad-hoc approaches to obtain graphs for images. However, those graphs introduce noise and it is unclear the effect of parser errors on captioning accuracy. In this work, we investigate to what extent scene graphs can help image captioning. Our results show that a state-of-the-art scene graph parser can boost performance almost as much as the ground truth graphs, showing that the bottleneck currently resides more on the captioning models than on the performance of the scene graph parser.</abstract>
<identifier type="citekey">wang-etal-2019-role</identifier>
<identifier type="doi">10.18653/v1/D19-6405</identifier>
<location>
<url>https://aclanthology.org/D19-6405</url>
</location>
<part>
<date>2019-11</date>
<extent unit="page">
<start>29</start>
<end>34</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T On the Role of Scene Graphs in Image Captioning
%A Wang, Dalin
%A Beck, Daniel
%A Cohn, Trevor
%Y Mogadala, Aditya
%Y Klakow, Dietrich
%Y Pezzelle, Sandro
%Y Moens, Marie-Francine
%S Proceedings of the Beyond Vision and LANguage: inTEgrating Real-world kNowledge (LANTERN)
%D 2019
%8 November
%I Association for Computational Linguistics
%C Hong Kong, China
%F wang-etal-2019-role
%X Scene graphs represent semantic information in images, which can help image captioning system to produce more descriptive outputs versus using only the image as context. Recent captioning approaches rely on ad-hoc approaches to obtain graphs for images. However, those graphs introduce noise and it is unclear the effect of parser errors on captioning accuracy. In this work, we investigate to what extent scene graphs can help image captioning. Our results show that a state-of-the-art scene graph parser can boost performance almost as much as the ground truth graphs, showing that the bottleneck currently resides more on the captioning models than on the performance of the scene graph parser.
%R 10.18653/v1/D19-6405
%U https://aclanthology.org/D19-6405
%U https://doi.org/10.18653/v1/D19-6405
%P 29-34
Markdown (Informal)
[On the Role of Scene Graphs in Image Captioning](https://aclanthology.org/D19-6405) (Wang et al., 2019)
ACL
- Dalin Wang, Daniel Beck, and Trevor Cohn. 2019. On the Role of Scene Graphs in Image Captioning. In Proceedings of the Beyond Vision and LANguage: inTEgrating Real-world kNowledge (LANTERN), pages 29–34, Hong Kong, China. Association for Computational Linguistics.