BibTeX
@inproceedings{abdelsalam-etal-2022-visual,
title = "Visual Semantic Parsing: From Images to {A}bstract {M}eaning {R}epresentation",
author = "Abdelsalam, Mohamed Ashraf and
Shi, Zhan and
Fancellu, Federico and
Basioti, Kalliopi and
Bhatt, Dhaivat and
Pavlovic, Vladimir and
Fazly, Afsaneh",
editor = "Fokkens, Antske and
Srikumar, Vivek",
booktitle = "Proceedings of the 26th Conference on Computational Natural Language Learning (CoNLL)",
month = dec,
year = "2022",
address = "Abu Dhabi, United Arab Emirates (Hybrid)",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.conll-1.19",
doi = "10.18653/v1/2022.conll-1.19",
pages = "282--300",
abstract = "The success of scene graphs for visual scene understanding has brought attention to the benefits of abstracting a visual input (e.g., image) into a structured representation, where entities (people and objects) are nodes connected by edges specifying their relations. Building these representations, however, requires expensive manual annotation in the form of images paired with their scene graphs or frames. These formalisms remain limited in the nature of entities and relations they can capture. In this paper, we propose to leverage a widely-used meaning representation in the field of natural language processing, the Abstract Meaning Representation (AMR), to address these shortcomings. Compared to scene graphs, which largely emphasize spatial relationships, our visual AMR graphs are more linguistically informed, with a focus on higher-level semantic concepts extrapolated from visual input. Moreover, they allow us to generate meta-AMR graphs to unify information contained in multiple image descriptions under one representation. Through extensive experimentation and analysis, we demonstrate that we can re-purpose an existing text-to-AMR parser to parse images into AMRs. Our findings point to important future research directions for improved scene understanding.",
}
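The BibTeX record above can be loaded programmatically. A minimal sketch, assuming the third-party bibtexparser package (1.x API) and a hypothetical file name:

# Assumes `pip install bibtexparser` (1.x); field names are lowercased on load.
import bibtexparser

with open("abdelsalam-etal-2022-visual.bib") as f:  # hypothetical file name
    db = bibtexparser.load(f)

entry = db.entries[0]
print(entry["ID"])     # citekey: abdelsalam-etal-2022-visual
print(entry["title"])  # case-protecting braces such as {A}bstract are kept verbatim
print(entry["doi"])    # 10.18653/v1/2022.conll-1.19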
MODS XML
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="abdelsalam-etal-2022-visual">
<titleInfo>
<title>Visual Semantic Parsing: From Images to Abstract Meaning Representation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Mohamed</namePart>
<namePart type="given">Ashraf</namePart>
<namePart type="family">Abdelsalam</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhan</namePart>
<namePart type="family">Shi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Federico</namePart>
<namePart type="family">Fancellu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kalliopi</namePart>
<namePart type="family">Basioti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dhaivat</namePart>
<namePart type="family">Bhatt</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vladimir</namePart>
<namePart type="family">Pavlovic</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Afsaneh</namePart>
<namePart type="family">Fazly</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 26th Conference on Computational Natural Language Learning (CoNLL)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Antske</namePart>
<namePart type="family">Fokkens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vivek</namePart>
<namePart type="family">Srikumar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, United Arab Emirates (Hybrid)</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>The success of scene graphs for visual scene understanding has brought attention to the benefits of abstracting a visual input (e.g., image) into a structured representation, where entities (people and objects) are nodes connected by edges specifying their relations. Building these representations, however, requires expensive manual annotation in the form of images paired with their scene graphs or frames. These formalisms remain limited in the nature of entities and relations they can capture. In this paper, we propose to leverage a widely-used meaning representation in the field of natural language processing, the Abstract Meaning Representation (AMR), to address these shortcomings. Compared to scene graphs, which largely emphasize spatial relationships, our visual AMR graphs are more linguistically informed, with a focus on higher-level semantic concepts extrapolated from visual input. Moreover, they allow us to generate meta-AMR graphs to unify information contained in multiple image descriptions under one representation. Through extensive experimentation and analysis, we demonstrate that we can re-purpose an existing text-to-AMR parser to parse images into AMRs. Our findings point to important future research directions for improved scene understanding.</abstract>
<identifier type="citekey">abdelsalam-etal-2022-visual</identifier>
<identifier type="doi">10.18653/v1/2022.conll-1.19</identifier>
<location>
<url>https://aclanthology.org/2022.conll-1.19</url>
</location>
<part>
<date>2022-12</date>
<extent unit="page">
<start>282</start>
<end>300</end>
</extent>
</part>
</mods>
</modsCollection>
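The MODS record parses with Python's standard library alone. A sketch using xml.etree.ElementTree; the file name is a placeholder:

import xml.etree.ElementTree as ET

NS = {"m": "http://www.loc.gov/mods/v3"}  # default namespace of the record
root = ET.parse("abdelsalam-etal-2022-visual.xml").getroot()  # hypothetical file name
mods = root.find("m:mods", NS)

title = mods.findtext("m:titleInfo/m:title", namespaces=NS)
doi = mods.findtext('m:identifier[@type="doi"]', namespaces=NS)
# Direct-child m:name elements are the authors; the editors sit under m:relatedItem.
authors = [
    " ".join(part.text for part in name.findall("m:namePart", NS))
    for name in mods.findall("m:name", NS)
]
print(title)
print(doi)
print(authors)  # ['Mohamed Ashraf Abdelsalam', 'Zhan Shi', ..., 'Afsaneh Fazly']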
Endnote
%0 Conference Proceedings
%T Visual Semantic Parsing: From Images to Abstract Meaning Representation
%A Abdelsalam, Mohamed Ashraf
%A Shi, Zhan
%A Fancellu, Federico
%A Basioti, Kalliopi
%A Bhatt, Dhaivat
%A Pavlovic, Vladimir
%A Fazly, Afsaneh
%Y Fokkens, Antske
%Y Srikumar, Vivek
%S Proceedings of the 26th Conference on Computational Natural Language Learning (CoNLL)
%D 2022
%8 December
%I Association for Computational Linguistics
%C Abu Dhabi, United Arab Emirates (Hybrid)
%F abdelsalam-etal-2022-visual
%X The success of scene graphs for visual scene understanding has brought attention to the benefits of abstracting a visual input (e.g., image) into a structured representation, where entities (people and objects) are nodes connected by edges specifying their relations. Building these representations, however, requires expensive manual annotation in the form of images paired with their scene graphs or frames. These formalisms remain limited in the nature of entities and relations they can capture. In this paper, we propose to leverage a widely-used meaning representation in the field of natural language processing, the Abstract Meaning Representation (AMR), to address these shortcomings. Compared to scene graphs, which largely emphasize spatial relationships, our visual AMR graphs are more linguistically informed, with a focus on higher-level semantic concepts extrapolated from visual input. Moreover, they allow us to generate meta-AMR graphs to unify information contained in multiple image descriptions under one representation. Through extensive experimentation and analysis, we demonstrate that we can re-purpose an existing text-to-AMR parser to parse images into AMRs. Our findings point to important future research directions for improved scene understanding.
%R 10.18653/v1/2022.conll-1.19
%U https://aclanthology.org/2022.conll-1.19
%U https://doi.org/10.18653/v1/2022.conll-1.19
%P 282-300
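The Endnote export is line-oriented: each line is a percent tag, a space, and a value, and tags such as %A (author) repeat once per value. A parsing sketch in standard-library Python (the file name is a placeholder):

from collections import defaultdict

def parse_endnote(text):
    # Collect tagged lines into tag -> list of values; %A, %Y, %U may repeat.
    record = defaultdict(list)
    for line in text.splitlines():
        if line.startswith("%") and " " in line:
            tag, _, value = line.partition(" ")
            record[tag].append(value.strip())
    return record

with open("abdelsalam-etal-2022-visual.enw") as f:  # hypothetical file name
    rec = parse_endnote(f.read())
print(rec["%T"][0])  # title
print(rec["%A"])     # all seven authors, in order
print(rec["%P"][0])  # 282-300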
Markdown (Informal)
[Visual Semantic Parsing: From Images to Abstract Meaning Representation](https://aclanthology.org/2022.conll-1.19) (Abdelsalam et al., CoNLL 2022)
ACL
Mohamed Ashraf Abdelsalam, Zhan Shi, Federico Fancellu, Kalliopi Basioti, Dhaivat Bhatt, Vladimir Pavlovic, and Afsaneh Fazly. 2022. Visual Semantic Parsing: From Images to Abstract Meaning Representation. In Proceedings of the 26th Conference on Computational Natural Language Learning (CoNLL), pages 282–300, Abu Dhabi, United Arab Emirates (Hybrid). Association for Computational Linguistics.
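As the abstract notes, the paper parses images into Abstract Meaning Representation (AMR) graphs. For readers unfamiliar with the formalism, here is a hand-written AMR in PENMAN notation (illustrative only, not an example from the paper) for the caption "A boy throws a ball", decoded with the third-party penman package:

import penman  # assumes `pip install penman`

amr = "(t / throw-01 :ARG0 (b / boy) :ARG1 (b2 / ball))"  # hand-written, illustrative
g = penman.decode(amr)
print(g.triples)
# [('t', ':instance', 'throw-01'), ('t', ':ARG0', 'b'), ('b', ':instance', 'boy'),
#  ('t', ':ARG1', 'b2'), ('b2', ':instance', 'ball')]

Each node is a concept (throw-01 is a PropBank-style predicate sense) and each :ARGn edge a semantic role, rather than a spatial relation as in scene graphs.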