@inproceedings{cafagna-etal-2023-hl,
title = "{HL} Dataset: Visually-grounded Description of Scenes, Actions and Rationales",
author = "Cafagna, Michele and
van Deemter, Kees and
Gatt, Albert",
editor = "Keet, C. Maria and
Lee, Hung-Yi and
Zarrie{\ss}, Sina",
booktitle = "Proceedings of the 16th International Natural Language Generation Conference",
month = sep,
year = "2023",
address = "Prague, Czechia",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.inlg-main.21/",
doi = "10.18653/v1/2023.inlg-main.21",
pages = "293--312",
abstract = "Current captioning datasets focus on object-centric captions, describing the visible objects in the image, often ending up stating the obvious (for humans), e.g. {\textquotedblleft}people eating food in a park{\textquotedblright}. Although these datasets are useful to evaluate the ability of Vision {\&} Language models to recognize and describe visual content, they do not support controlled experiments involving model testing or fine-tuning, with more high-level captions, which humans find easy and natural to produce. For example, people often describe images based on the type of scene they depict ({\textquotedblleft}people at a holiday resort{\textquotedblright}) and the actions they perform ({\textquotedblleft}people having a picnic{\textquotedblright}). Such concepts are based on personal experience and contribute to forming common sense assumptions. We present the High-Level Dataset, a dataset extending 14997 images from the COCO dataset, aligned with a new set of 134,973 human-annotated (high-level) captions collected along three axes: scenes, actions and rationales. We further extend this dataset with confidence scores collected from an independent set of readers, as well as a set of narrative captions generated synthetically, by combining each of the three axes. We describe this dataset and analyse it extensively. We also present baseline results for the High-Level Captioning task."
}
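If the BibTeX record above needs to be consumed programmatically (for example, to pull out the DOI or the page range), the sketch below shows one way to do it with the third-party bibtexparser package (v1 API). The file name hl_dataset.bib and the printed fields are illustrative assumptions, not part of the record itself.

# Minimal sketch, assuming `pip install bibtexparser` (v1 API; v2 differs)
# and that the @inproceedings entry above is saved as hl_dataset.bib.
import bibtexparser
from bibtexparser.bparser import BibTexParser

parser = BibTexParser(common_strings=True)   # resolves the unquoted `sep` month macro
with open("hl_dataset.bib", encoding="utf-8") as f:
    db = bibtexparser.load(f, parser=parser)

entry = db.entries[0]                        # the single record, as a dict
print(entry["ID"])                           # cafagna-etal-2023-hl
print(entry["title"])                        # {HL} Dataset: Visually-grounded ...
print(entry["doi"], entry["pages"])          # 10.18653/v1/2023.inlg-main.21 293--312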
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="cafagna-etal-2023-hl">
    <titleInfo>
      <title>HL Dataset: Visually-grounded Description of Scenes, Actions and Rationales</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Michele</namePart>
      <namePart type="family">Cafagna</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Kees</namePart>
      <namePart type="family">van Deemter</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Albert</namePart>
      <namePart type="family">Gatt</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2023-09</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 16th International Natural Language Generation Conference</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">C</namePart>
        <namePart type="given">Maria</namePart>
        <namePart type="family">Keet</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Hung-Yi</namePart>
        <namePart type="family">Lee</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Sina</namePart>
        <namePart type="family">Zarrieß</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Prague, Czechia</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Current captioning datasets focus on object-centric captions, describing the visible objects in the image, often ending up stating the obvious (for humans), e.g. “people eating food in a park”. Although these datasets are useful to evaluate the ability of Vision &amp; Language models to recognize and describe visual content, they do not support controlled experiments involving model testing or fine-tuning, with more high-level captions, which humans find easy and natural to produce. For example, people often describe images based on the type of scene they depict (“people at a holiday resort”) and the actions they perform (“people having a picnic”). Such concepts are based on personal experience and contribute to forming common sense assumptions. We present the High-Level Dataset, a dataset extending 14997 images from the COCO dataset, aligned with a new set of 134,973 human-annotated (high-level) captions collected along three axes: scenes, actions and rationales. We further extend this dataset with confidence scores collected from an independent set of readers, as well as a set of narrative captions generated synthetically, by combining each of the three axes. We describe this dataset and analyse it extensively. We also present baseline results for the High-Level Captioning task.</abstract>
    <identifier type="citekey">cafagna-etal-2023-hl</identifier>
    <identifier type="doi">10.18653/v1/2023.inlg-main.21</identifier>
    <location>
      <url>https://aclanthology.org/2023.inlg-main.21/</url>
    </location>
    <part>
      <date>2023-09</date>
      <extent unit="page">
        <start>293</start>
        <end>312</end>
      </extent>
    </part>
  </mods>
</modsCollection>
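The MODS record above can be read with nothing beyond the Python standard library; the sketch below extracts the title, the author names, and the DOI. The file name hl_dataset.xml is an illustrative assumption. Note that every element lives in the http://www.loc.gov/mods/v3 default namespace, so queries need a namespace map.

# Minimal sketch, assuming the MODS record above is saved as hl_dataset.xml.
import xml.etree.ElementTree as ET

NS = {"m": "http://www.loc.gov/mods/v3"}     # default namespace on <modsCollection>

root = ET.parse("hl_dataset.xml").getroot()
mods = root.find("m:mods", NS)

title = mods.find("m:titleInfo/m:title", NS).text
doi = mods.find("m:identifier[@type='doi']", NS).text
authors = [
    " ".join(part.text for part in name.findall("m:namePart", NS))
    for name in mods.findall("m:name", NS)                      # direct children = paper authors
    if name.find("m:role/m:roleTerm", NS).text == "author"
]
print(title)     # HL Dataset: Visually-grounded Description of Scenes, Actions and Rationales
print(authors)   # ['Michele Cafagna', 'Kees van Deemter', 'Albert Gatt']
print(doi)       # 10.18653/v1/2023.inlg-main.21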
%0 Conference Proceedings
%T HL Dataset: Visually-grounded Description of Scenes, Actions and Rationales
%A Cafagna, Michele
%A van Deemter, Kees
%A Gatt, Albert
%Y Keet, C. Maria
%Y Lee, Hung-Yi
%Y Zarrieß, Sina
%S Proceedings of the 16th International Natural Language Generation Conference
%D 2023
%8 September
%I Association for Computational Linguistics
%C Prague, Czechia
%F cafagna-etal-2023-hl
%X Current captioning datasets focus on object-centric captions, describing the visible objects in the image, often ending up stating the obvious (for humans), e.g. “people eating food in a park”. Although these datasets are useful to evaluate the ability of Vision & Language models to recognize and describe visual content, they do not support controlled experiments involving model testing or fine-tuning, with more high-level captions, which humans find easy and natural to produce. For example, people often describe images based on the type of scene they depict (“people at a holiday resort”) and the actions they perform (“people having a picnic”). Such concepts are based on personal experience and contribute to forming common sense assumptions. We present the High-Level Dataset, a dataset extending 14997 images from the COCO dataset, aligned with a new set of 134,973 human-annotated (high-level) captions collected along three axes: scenes, actions and rationales. We further extend this dataset with confidence scores collected from an independent set of readers, as well as a set of narrative captions generated synthetically, by combining each of the three axes. We describe this dataset and analyse it extensively. We also present baseline results for the High-Level Captioning task.
%R 10.18653/v1/2023.inlg-main.21
%U https://aclanthology.org/2023.inlg-main.21/
%U https://doi.org/10.18653/v1/2023.inlg-main.21
%P 293-312
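The Endnote/refer export above is a simple line-oriented format: each line starts with a two-character % tag followed by a space and a value, and some tags repeat (%A for each author, %U for each URL). The sketch below collects the fields into lists keyed by tag; the file name hl_dataset.enw is an illustrative assumption.

# Minimal sketch, assuming the %-tagged record above is saved as hl_dataset.enw.
from collections import defaultdict

fields = defaultdict(list)
with open("hl_dataset.enw", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if line.startswith("%") and len(line) > 2:
            tag, value = line[:2], line[3:]      # e.g. "%T", "HL Dataset: ..."
            fields[tag].append(value)

print(fields["%T"][0])   # HL Dataset: Visually-grounded Description of Scenes, Actions and Rationales
print(fields["%A"])      # ['Cafagna, Michele', 'van Deemter, Kees', 'Gatt, Albert']
print(fields["%P"][0])   # 293-312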
Markdown (Informal)
[HL Dataset: Visually-grounded Description of Scenes, Actions and Rationales](https://aclanthology.org/2023.inlg-main.21/) (Cafagna et al., INLG-SIGDIAL 2023)
ACL
Michele Cafagna, Kees van Deemter, and Albert Gatt. 2023. [HL Dataset: Visually-grounded Description of Scenes, Actions and Rationales](https://aclanthology.org/2023.inlg-main.21/). In *Proceedings of the 16th International Natural Language Generation Conference*, pages 293–312, Prague, Czechia. Association for Computational Linguistics.