@inproceedings{utescher-zarriess-2021-castle,
title = "What Did This Castle Look like before? Exploring Referential Relations in Naturally Occurring Multimodal Texts",
author = "Utescher, Ronja and
Zarrie{\ss}, Sina",
editor = "Mosbach, Marius and
Hedderich, Michael A. and
Pezzelle, Sandro and
Mogadala, Aditya and
Klakow, Dietrich and
Moens, Marie-Francine and
Akata, Zeynep",
booktitle = "Proceedings of the Third Workshop on Beyond Vision and LANguage: inTEgrating Real-world kNowledge (LANTERN)",
month = apr,
year = "2021",
address = "Kyiv, Ukraine",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.lantern-1.5",
pages = "53--60",
abstract = "Multi-modal texts are abundant and diverse in structure, yet Language {\&} Vision research of these naturally occurring texts has mostly focused on genres that are comparatively light on text, like tweets. In this paper, we discuss the challenges and potential benefits of a L{\&}V framework that explicitly models referential relations, taking Wikipedia articles about buildings as an example. We briefly survey existing related tasks in L{\&}V and propose multi-modal information extraction as a general direction for future research.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="utescher-zarriess-2021-castle">
<titleInfo>
<title>What Did This Castle Look like before? Exploring Referential Relations in Naturally Occurring Multimodal Texts</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ronja</namePart>
<namePart type="family">Utescher</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sina</namePart>
<namePart type="family">Zarrieß</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-04</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Third Workshop on Beyond Vision and LANguage: inTEgrating Real-world kNowledge (LANTERN)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Marius</namePart>
<namePart type="family">Mosbach</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Michael</namePart>
<namePart type="given">A.</namePart>
<namePart type="family">Hedderich</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sandro</namePart>
<namePart type="family">Pezzelle</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aditya</namePart>
<namePart type="family">Mogadala</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dietrich</namePart>
<namePart type="family">Klakow</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marie-Francine</namePart>
<namePart type="family">Moens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zeynep</namePart>
<namePart type="family">Akata</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Kyiv, Ukraine</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Multi-modal texts are abundant and diverse in structure, yet Language &amp; Vision research of these naturally occurring texts has mostly focused on genres that are comparatively light on text, like tweets. In this paper, we discuss the challenges and potential benefits of a L&amp;V framework that explicitly models referential relations, taking Wikipedia articles about buildings as an example. We briefly survey existing related tasks in L&amp;V and propose multi-modal information extraction as a general direction for future research.</abstract>
<identifier type="citekey">utescher-zarriess-2021-castle</identifier>
<location>
<url>https://aclanthology.org/2021.lantern-1.5</url>
</location>
<part>
<date>2021-04</date>
<extent unit="page">
<start>53</start>
<end>60</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T What Did This Castle Look like before? Exploring Referential Relations in Naturally Occurring Multimodal Texts
%A Utescher, Ronja
%A Zarrieß, Sina
%Y Mosbach, Marius
%Y Hedderich, Michael A.
%Y Pezzelle, Sandro
%Y Mogadala, Aditya
%Y Klakow, Dietrich
%Y Moens, Marie-Francine
%Y Akata, Zeynep
%S Proceedings of the Third Workshop on Beyond Vision and LANguage: inTEgrating Real-world kNowledge (LANTERN)
%D 2021
%8 April
%I Association for Computational Linguistics
%C Kyiv, Ukraine
%F utescher-zarriess-2021-castle
%X Multi-modal texts are abundant and diverse in structure, yet Language & Vision research of these naturally occurring texts has mostly focused on genres that are comparatively light on text, like tweets. In this paper, we discuss the challenges and potential benefits of a L&V framework that explicitly models referential relations, taking Wikipedia articles about buildings as an example. We briefly survey existing related tasks in L&V and propose multi-modal information extraction as a general direction for future research.
%U https://aclanthology.org/2021.lantern-1.5
%P 53-60
Markdown (Informal)
[What Did This Castle Look like before? Exploring Referential Relations in Naturally Occurring Multimodal Texts](https://aclanthology.org/2021.lantern-1.5) (Utescher & Zarrieß, LANTERN 2021)
ACL