@inproceedings{ilinykh-dobnik-2022-decoding,
title = "Do Decoding Algorithms Capture Discourse Structure in Multi-Modal Tasks? A Case Study of Image Paragraph Generation",
author = "Ilinykh, Nikolai and
Dobnik, Simon",
editor = "Bosselut, Antoine and
Chandu, Khyathi and
Dhole, Kaustubh and
Gangal, Varun and
Gehrmann, Sebastian and
Jernite, Yacine and
Novikova, Jekaterina and
Perez-Beltrachini, Laura",
booktitle = "Proceedings of the 2nd Workshop on Natural Language Generation, Evaluation, and Metrics (GEM)",
month = dec,
year = "2022",
address = "Abu Dhabi, United Arab Emirates (Hybrid)",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.gem-1.45",
doi = "10.18653/v1/2022.gem-1.45",
pages = "480--493",
abstract = "This paper describes insights into how different inference algorithms structure discourse in image paragraphs. We train a multi-modal transformer and compare 11 variations of decoding algorithms. We propose to evaluate image paragraphs not only with standard automatic metrics, but also with a more extensive, {``}under the hood{''} analysis of the discourse formed by sentences. Our results show that while decoding algorithms can be unfaithful to the reference texts, they still generate grounded descriptions, but they also lack understanding of the discourse structure and differ from humans in terms of attentional structure over images.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="ilinykh-dobnik-2022-decoding">
<titleInfo>
<title>Do Decoding Algorithms Capture Discourse Structure in Multi-Modal Tasks? A Case Study of Image Paragraph Generation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nikolai</namePart>
<namePart type="family">Ilinykh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Simon</namePart>
<namePart type="family">Dobnik</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2nd Workshop on Natural Language Generation, Evaluation, and Metrics (GEM)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Antoine</namePart>
<namePart type="family">Bosselut</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Khyathi</namePart>
<namePart type="family">Chandu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kaustubh</namePart>
<namePart type="family">Dhole</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Varun</namePart>
<namePart type="family">Gangal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sebastian</namePart>
<namePart type="family">Gehrmann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yacine</namePart>
<namePart type="family">Jernite</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jekaterina</namePart>
<namePart type="family">Novikova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Laura</namePart>
<namePart type="family">Perez-Beltrachini</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, United Arab Emirates (Hybrid)</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>This paper describes insights into how different inference algorithms structure discourse in image paragraphs. We train a multi-modal transformer and compare 11 variations of decoding algorithms. We propose to evaluate image paragraphs not only with standard automatic metrics, but also with a more extensive, “under the hood” analysis of the discourse formed by sentences. Our results show that while decoding algorithms can be unfaithful to the reference texts, they still generate grounded descriptions, but they also lack understanding of the discourse structure and differ from humans in terms of attentional structure over images.</abstract>
<identifier type="citekey">ilinykh-dobnik-2022-decoding</identifier>
<identifier type="doi">10.18653/v1/2022.gem-1.45</identifier>
<location>
<url>https://aclanthology.org/2022.gem-1.45</url>
</location>
<part>
<date>2022-12</date>
<extent unit="page">
<start>480</start>
<end>493</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Do Decoding Algorithms Capture Discourse Structure in Multi-Modal Tasks? A Case Study of Image Paragraph Generation
%A Ilinykh, Nikolai
%A Dobnik, Simon
%Y Bosselut, Antoine
%Y Chandu, Khyathi
%Y Dhole, Kaustubh
%Y Gangal, Varun
%Y Gehrmann, Sebastian
%Y Jernite, Yacine
%Y Novikova, Jekaterina
%Y Perez-Beltrachini, Laura
%S Proceedings of the 2nd Workshop on Natural Language Generation, Evaluation, and Metrics (GEM)
%D 2022
%8 December
%I Association for Computational Linguistics
%C Abu Dhabi, United Arab Emirates (Hybrid)
%F ilinykh-dobnik-2022-decoding
%X This paper describes insights into how different inference algorithms structure discourse in image paragraphs. We train a multi-modal transformer and compare 11 variations of decoding algorithms. We propose to evaluate image paragraphs not only with standard automatic metrics, but also with a more extensive, “under the hood” analysis of the discourse formed by sentences. Our results show that while decoding algorithms can be unfaithful to the reference texts, they still generate grounded descriptions, but they also lack understanding of the discourse structure and differ from humans in terms of attentional structure over images.
%R 10.18653/v1/2022.gem-1.45
%U https://aclanthology.org/2022.gem-1.45
%U https://doi.org/10.18653/v1/2022.gem-1.45
%P 480-493
Markdown (Informal)
[Do Decoding Algorithms Capture Discourse Structure in Multi-Modal Tasks? A Case Study of Image Paragraph Generation](https://aclanthology.org/2022.gem-1.45) (Ilinykh & Dobnik, GEM 2022)
ACL