BibTeX
@inproceedings{silberer-pinkal-2018-grounding,
    title = "Grounding Semantic Roles in Images",
    author = "Silberer, Carina and
      Pinkal, Manfred",
    editor = "Riloff, Ellen and
      Chiang, David and
      Hockenmaier, Julia and
      Tsujii, Jun{'}ichi",
    booktitle = "Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing",
    month = oct # "-" # nov,
    year = "2018",
    address = "Brussels, Belgium",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/D18-1282/",
    doi = "10.18653/v1/D18-1282",
    pages = "2616--2626",
    abstract = "We address the task of visual semantic role labeling (vSRL), the identification of the participants of a situation or event in a visual scene, and their labeling with their semantic relations to the event or situation. We render candidate participants as image regions of objects, and train a model which learns to ground roles in the regions which depict the corresponding participant. Experimental results demonstrate that we can train a vSRL model without reliance on prohibitive image-based role annotations, by utilizing noisy data which we extract automatically from image captions using a linguistic SRL system. Furthermore, our model induces frame-semantic visual representations, and their comparison to previous work on supervised visual verb sense disambiguation yields overall better results."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="silberer-pinkal-2018-grounding">
<titleInfo>
<title>Grounding Semantic Roles in Images</title>
</titleInfo>
<name type="personal">
<namePart type="given">Carina</namePart>
<namePart type="family">Silberer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Manfred</namePart>
<namePart type="family">Pinkal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2018-oct-nov</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ellen</namePart>
<namePart type="family">Riloff</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Chiang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Julia</namePart>
<namePart type="family">Hockenmaier</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jun’ichi</namePart>
<namePart type="family">Tsujii</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Brussels, Belgium</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We address the task of visual semantic role labeling (vSRL), the identification of the participants of a situation or event in a visual scene, and their labeling with their semantic relations to the event or situation. We render candidate participants as image regions of objects, and train a model which learns to ground roles in the regions which depict the corresponding participant. Experimental results demonstrate that we can train a vSRL model without reliance on prohibitive image-based role annotations, by utilizing noisy data which we extract automatically from image captions using a linguistic SRL system. Furthermore, our model induces frame—semantic visual representations, and their comparison to previous work on supervised visual verb sense disambiguation yields overall better results.</abstract>
<identifier type="citekey">silberer-pinkal-2018-grounding</identifier>
<identifier type="doi">10.18653/v1/D18-1282</identifier>
<location>
<url>https://aclanthology.org/D18-1282/</url>
</location>
<part>
<date>2018-oct-nov</date>
<extent unit="page">
<start>2616</start>
<end>2626</end>
</extent>
</part>
</mods>
</modsCollection>

Endnote
%0 Conference Proceedings
%T Grounding Semantic Roles in Images
%A Silberer, Carina
%A Pinkal, Manfred
%Y Riloff, Ellen
%Y Chiang, David
%Y Hockenmaier, Julia
%Y Tsujii, Jun’ichi
%S Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing
%D 2018
%8 oct-nov
%I Association for Computational Linguistics
%C Brussels, Belgium
%F silberer-pinkal-2018-grounding
%X We address the task of visual semantic role labeling (vSRL), the identification of the participants of a situation or event in a visual scene, and their labeling with their semantic relations to the event or situation. We render candidate participants as image regions of objects, and train a model which learns to ground roles in the regions which depict the corresponding participant. Experimental results demonstrate that we can train a vSRL model without reliance on prohibitive image-based role annotations, by utilizing noisy data which we extract automatically from image captions using a linguistic SRL system. Furthermore, our model induces frame-semantic visual representations, and their comparison to previous work on supervised visual verb sense disambiguation yields overall better results.
%R 10.18653/v1/D18-1282
%U https://aclanthology.org/D18-1282/
%U https://doi.org/10.18653/v1/D18-1282
%P 2616-2626

Markdown (Informal)
[Grounding Semantic Roles in Images](https://aclanthology.org/D18-1282/) (Silberer & Pinkal, EMNLP 2018)

ACL
Carina Silberer and Manfred Pinkal. 2018. Grounding Semantic Roles in Images. In *Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing*, pages 2616–2626, Brussels, Belgium. Association for Computational Linguistics.