@inproceedings{muraoka-etal-2020-image,
title = "Image Position Prediction in Multimodal Documents",
author = "Muraoka, Masayasu and
Kohita, Ryosuke and
Ishii, Etsuko",
editor = "Calzolari, Nicoletta and
B{\'e}chet, Fr{\'e}d{\'e}ric and
Blache, Philippe and
Choukri, Khalid and
Cieri, Christopher and
Declerck, Thierry and
Goggi, Sara and
Isahara, Hitoshi and
Maegaard, Bente and
Mariani, Joseph and
Mazo, H{\'e}l{\`e}ne and
Moreno, Asuncion and
Odijk, Jan and
Piperidis, Stelios",
booktitle = "Proceedings of the Twelfth Language Resources and Evaluation Conference",
month = may,
year = "2020",
address = "Marseille, France",
publisher = "European Language Resources Association",
url = "https://aclanthology.org/2020.lrec-1.526",
pages = "4265--4274",
abstract = "Conventional multimodal tasks, such as caption generation and visual question answering, have allowed machines to understand an image by describing or being asked about it in natural language, often via a sentence. Datasets for these tasks contain a large number of pairs of an image and the corresponding sentence as an instance. However, a real multimodal document such as a news article or Wikipedia page consists of multiple sentences with multiple images. Such documents require an advanced skill of jointly considering the multiple texts and multiple images, beyond a single sentence and image, for the interpretation. Therefore, aiming at building a system that can understand multimodal documents, we propose a task called image position prediction (IPP). In this task, a system learns plausible positions of images in a given document. To study this task, we automatically constructed a dataset of 66K multimodal documents with 320K images from Wikipedia articles. We conducted a preliminary experiment to evaluate the performance of a current multimodal system on our task. The experimental results show that the system outperformed simple baselines while the performance is still far from human performance, which thus poses new challenges in multimodal research.",
language = "English",
ISBN = "979-10-95546-34-4",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="muraoka-etal-2020-image">
<titleInfo>
<title>Image Position Prediction in Multimodal Documents</title>
</titleInfo>
<name type="personal">
<namePart type="given">Masayasu</namePart>
<namePart type="family">Muraoka</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ryosuke</namePart>
<namePart type="family">Kohita</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Etsuko</namePart>
<namePart type="family">Ishii</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<language>
<languageTerm type="text">English</languageTerm>
<languageTerm type="code" authority="iso639-2b">eng</languageTerm>
</language>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Twelfth Language Resources and Evaluation Conference</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Frédéric</namePart>
<namePart type="family">Béchet</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Philippe</namePart>
<namePart type="family">Blache</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Khalid</namePart>
<namePart type="family">Choukri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Christopher</namePart>
<namePart type="family">Cieri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Thierry</namePart>
<namePart type="family">Declerck</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sara</namePart>
<namePart type="family">Goggi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hitoshi</namePart>
<namePart type="family">Isahara</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bente</namePart>
<namePart type="family">Maegaard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joseph</namePart>
<namePart type="family">Mariani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hélène</namePart>
<namePart type="family">Mazo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Asuncion</namePart>
<namePart type="family">Moreno</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jan</namePart>
<namePart type="family">Odijk</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stelios</namePart>
<namePart type="family">Piperidis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>European Language Resources Association</publisher>
<place>
<placeTerm type="text">Marseille, France</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-10-95546-34-4</identifier>
</relatedItem>
<abstract>Conventional multimodal tasks, such as caption generation and visual question answering, have allowed machines to understand an image by describing or being asked about it in natural language, often via a sentence. Datasets for these tasks contain a large number of pairs of an image and the corresponding sentence as an instance. However, a real multimodal document such as a news article or Wikipedia page consists of multiple sentences with multiple images. Such documents require an advanced skill of jointly considering the multiple texts and multiple images, beyond a single sentence and image, for the interpretation. Therefore, aiming at building a system that can understand multimodal documents, we propose a task called image position prediction (IPP). In this task, a system learns plausible positions of images in a given document. To study this task, we automatically constructed a dataset of 66K multimodal documents with 320K images from Wikipedia articles. We conducted a preliminary experiment to evaluate the performance of a current multimodal system on our task. The experimental results show that the system outperformed simple baselines while the performance is still far from human performance, which thus poses new challenges in multimodal research.</abstract>
<identifier type="citekey">muraoka-etal-2020-image</identifier>
<location>
<url>https://aclanthology.org/2020.lrec-1.526</url>
</location>
<part>
<date>2020-05</date>
<extent unit="page">
<start>4265</start>
<end>4274</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Image Position Prediction in Multimodal Documents
%A Muraoka, Masayasu
%A Kohita, Ryosuke
%A Ishii, Etsuko
%Y Calzolari, Nicoletta
%Y Béchet, Frédéric
%Y Blache, Philippe
%Y Choukri, Khalid
%Y Cieri, Christopher
%Y Declerck, Thierry
%Y Goggi, Sara
%Y Isahara, Hitoshi
%Y Maegaard, Bente
%Y Mariani, Joseph
%Y Mazo, Hélène
%Y Moreno, Asuncion
%Y Odijk, Jan
%Y Piperidis, Stelios
%S Proceedings of the Twelfth Language Resources and Evaluation Conference
%D 2020
%8 May
%I European Language Resources Association
%C Marseille, France
%@ 979-10-95546-34-4
%G English
%F muraoka-etal-2020-image
%X Conventional multimodal tasks, such as caption generation and visual question answering, have allowed machines to understand an image by describing or being asked about it in natural language, often via a sentence. Datasets for these tasks contain a large number of pairs of an image and the corresponding sentence as an instance. However, a real multimodal document such as a news article or Wikipedia page consists of multiple sentences with multiple images. Such documents require an advanced skill of jointly considering the multiple texts and multiple images, beyond a single sentence and image, for the interpretation. Therefore, aiming at building a system that can understand multimodal documents, we propose a task called image position prediction (IPP). In this task, a system learns plausible positions of images in a given document. To study this task, we automatically constructed a dataset of 66K multimodal documents with 320K images from Wikipedia articles. We conducted a preliminary experiment to evaluate the performance of a current multimodal system on our task. The experimental results show that the system outperformed simple baselines while the performance is still far from human performance, which thus poses new challenges in multimodal research.
%U https://aclanthology.org/2020.lrec-1.526
%P 4265-4274
Markdown (Informal)
[Image Position Prediction in Multimodal Documents](https://aclanthology.org/2020.lrec-1.526) (Muraoka et al., LREC 2020)
ACL
- Masayasu Muraoka, Ryosuke Kohita, and Etsuko Ishii. 2020. Image Position Prediction in Multimodal Documents. In Proceedings of the Twelfth Language Resources and Evaluation Conference, pages 4265–4274, Marseille, France. European Language Resources Association.