@inproceedings{surikuchi-etal-2023-groovist,
title = "{GROOV}i{ST}: A Metric for Grounding Objects in Visual Storytelling",
author = "Surikuchi, Aditya and
Pezzelle, Sandro and
Fern{\'a}ndez, Raquel",
editor = "Bouamor, Houda and
Pino, Juan and
Bali, Kalika",
booktitle = "Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing",
month = dec,
year = "2023",
address = "Singapore",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.emnlp-main.202",
doi = "10.18653/v1/2023.emnlp-main.202",
pages = "3331--3339",
abstract = "A proper evaluation of stories generated for a sequence of images{---}the task commonly referred to as visual storytelling{---}must consider multiple aspects, such as coherence, grammatical correctness, and visual grounding. In this work, we focus on evaluating the degree of grounding, that is, the extent to which a story is about the entities shown in the images. We analyze current metrics, both designed for this purpose and for general vision-text alignment. Given their observed shortcomings, we propose a novel evaluation tool, GROOViST, that accounts for cross-modal dependencies, \textit{temporal misalignments} (the fact that the order in which entities appear in the story and the image sequence may not match), and human intuitions on visual grounding. An additional advantage of GROOViST is its modular design, where the contribution of each component can be assessed and interpreted individually.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="surikuchi-etal-2023-groovist">
<titleInfo>
<title>GROOViST: A Metric for Grounding Objects in Visual Storytelling</title>
</titleInfo>
<name type="personal">
<namePart type="given">Aditya</namePart>
<namePart type="family">Surikuchi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sandro</namePart>
<namePart type="family">Pezzelle</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Raquel</namePart>
<namePart type="family">Fernández</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Houda</namePart>
<namePart type="family">Bouamor</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Juan</namePart>
<namePart type="family">Pino</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kalika</namePart>
<namePart type="family">Bali</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Singapore</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>A proper evaluation of stories generated for a sequence of images—the task commonly referred to as visual storytelling—must consider multiple aspects, such as coherence, grammatical correctness, and visual grounding. In this work, we focus on evaluating the degree of grounding, that is, the extent to which a story is about the entities shown in the images. We analyze current metrics, both designed for this purpose and for general vision-text alignment. Given their observed shortcomings, we propose a novel evaluation tool, GROOViST, that accounts for cross-modal dependencies, temporal misalignments (the fact that the order in which entities appear in the story and the image sequence may not match), and human intuitions on visual grounding. An additional advantage of GROOViST is its modular design, where the contribution of each component can be assessed and interpreted individually.</abstract>
<identifier type="citekey">surikuchi-etal-2023-groovist</identifier>
<identifier type="doi">10.18653/v1/2023.emnlp-main.202</identifier>
<location>
<url>https://aclanthology.org/2023.emnlp-main.202</url>
</location>
<part>
<date>2023-12</date>
<extent unit="page">
<start>3331</start>
<end>3339</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T GROOViST: A Metric for Grounding Objects in Visual Storytelling
%A Surikuchi, Aditya
%A Pezzelle, Sandro
%A Fernández, Raquel
%Y Bouamor, Houda
%Y Pino, Juan
%Y Bali, Kalika
%S Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing
%D 2023
%8 December
%I Association for Computational Linguistics
%C Singapore
%F surikuchi-etal-2023-groovist
%X A proper evaluation of stories generated for a sequence of images—the task commonly referred to as visual storytelling—must consider multiple aspects, such as coherence, grammatical correctness, and visual grounding. In this work, we focus on evaluating the degree of grounding, that is, the extent to which a story is about the entities shown in the images. We analyze current metrics, both designed for this purpose and for general vision-text alignment. Given their observed shortcomings, we propose a novel evaluation tool, GROOViST, that accounts for cross-modal dependencies, temporal misalignments (the fact that the order in which entities appear in the story and the image sequence may not match), and human intuitions on visual grounding. An additional advantage of GROOViST is its modular design, where the contribution of each component can be assessed and interpreted individually.
%R 10.18653/v1/2023.emnlp-main.202
%U https://aclanthology.org/2023.emnlp-main.202
%U https://doi.org/10.18653/v1/2023.emnlp-main.202
%P 3331-3339
Markdown (Informal)
[GROOViST: A Metric for Grounding Objects in Visual Storytelling](https://aclanthology.org/2023.emnlp-main.202) (Surikuchi et al., EMNLP 2023)
ACL