@inproceedings{pantazopoulos-etal-2022-combine,
title = "Combine to Describe: Evaluating Compositional Generalization in Image Captioning",
author = "Pantazopoulos, Georgios and
Suglia, Alessandro and
Eshghi, Arash",
editor = "Louvan, Samuel and
Madotto, Andrea and
Madureira, Brielen",
booktitle = "Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics: Student Research Workshop",
month = may,
year = "2022",
address = "Dublin, Ireland",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.acl-srw.11",
doi = "10.18653/v1/2022.acl-srw.11",
pages = "115--131",
abstract = "Compositionality {--} the ability to combine simpler concepts to understand {\&} generate arbitrarily more complex conceptual structures {--} has long been thought to be the cornerstone of human language capacity. With the recent, notable success of neural models in various NLP tasks, attention has now naturally turned to the compositional capacity of these models. In this paper, we study the compositional generalization properties of image captioning models. We perform a set of experiments under controlled conditions using model and data ablations, each designed to benchmark a particular facet of compositional generalization: systematicity is the ability of a model to create novel combinations of concepts out of those observed during training, productivity is here operationalised as the capacity of a model to extend its predictions beyond the length distribution it has observed during training, and substitutivity is concerned with the robustness of the model against synonym substitutions. While previous work has focused primarily on systematicity, here we provide a more in-depth analysis of the strengths and weaknesses of state of the art captioning models. Our findings demonstrate that the models we study here do not compositionally generalize in terms of systematicity and productivity, however, they are robust to some degree to synonym substitutions.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="pantazopoulos-etal-2022-combine">
<titleInfo>
<title>Combine to Describe: Evaluating Compositional Generalization in Image Captioning</title>
</titleInfo>
<name type="personal">
<namePart type="given">Georgios</namePart>
<namePart type="family">Pantazopoulos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alessandro</namePart>
<namePart type="family">Suglia</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Arash</namePart>
<namePart type="family">Eshghi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics: Student Research Workshop</title>
</titleInfo>
<name type="personal">
<namePart type="given">Samuel</namePart>
<namePart type="family">Louvan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Andrea</namePart>
<namePart type="family">Madotto</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Brielen</namePart>
<namePart type="family">Madureira</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Dublin, Ireland</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Compositionality – the ability to combine simpler concepts to understand &amp; generate arbitrarily more complex conceptual structures – has long been thought to be the cornerstone of human language capacity. With the recent, notable success of neural models in various NLP tasks, attention has now naturally turned to the compositional capacity of these models. In this paper, we study the compositional generalization properties of image captioning models. We perform a set of experiments under controlled conditions using model and data ablations, each designed to benchmark a particular facet of compositional generalization: systematicity is the ability of a model to create novel combinations of concepts out of those observed during training, productivity is here operationalised as the capacity of a model to extend its predictions beyond the length distribution it has observed during training, and substitutivity is concerned with the robustness of the model against synonym substitutions. While previous work has focused primarily on systematicity, here we provide a more in-depth analysis of the strengths and weaknesses of state of the art captioning models. Our findings demonstrate that the models we study here do not compositionally generalize in terms of systematicity and productivity, however, they are robust to some degree to synonym substitutions.</abstract>
<identifier type="citekey">pantazopoulos-etal-2022-combine</identifier>
<identifier type="doi">10.18653/v1/2022.acl-srw.11</identifier>
<location>
<url>https://aclanthology.org/2022.acl-srw.11</url>
</location>
<part>
<date>2022-05</date>
<extent unit="page">
<start>115</start>
<end>131</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Combine to Describe: Evaluating Compositional Generalization in Image Captioning
%A Pantazopoulos, Georgios
%A Suglia, Alessandro
%A Eshghi, Arash
%Y Louvan, Samuel
%Y Madotto, Andrea
%Y Madureira, Brielen
%S Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics: Student Research Workshop
%D 2022
%8 May
%I Association for Computational Linguistics
%C Dublin, Ireland
%F pantazopoulos-etal-2022-combine
%X Compositionality – the ability to combine simpler concepts to understand & generate arbitrarily more complex conceptual structures – has long been thought to be the cornerstone of human language capacity. With the recent, notable success of neural models in various NLP tasks, attention has now naturally turned to the compositional capacity of these models. In this paper, we study the compositional generalization properties of image captioning models. We perform a set of experiments under controlled conditions using model and data ablations, each designed to benchmark a particular facet of compositional generalization: systematicity is the ability of a model to create novel combinations of concepts out of those observed during training, productivity is here operationalised as the capacity of a model to extend its predictions beyond the length distribution it has observed during training, and substitutivity is concerned with the robustness of the model against synonym substitutions. While previous work has focused primarily on systematicity, here we provide a more in-depth analysis of the strengths and weaknesses of state of the art captioning models. Our findings demonstrate that the models we study here do not compositionally generalize in terms of systematicity and productivity, however, they are robust to some degree to synonym substitutions.
%R 10.18653/v1/2022.acl-srw.11
%U https://aclanthology.org/2022.acl-srw.11
%U https://doi.org/10.18653/v1/2022.acl-srw.11
%P 115-131
Markdown (Informal)
[Combine to Describe: Evaluating Compositional Generalization in Image Captioning](https://aclanthology.org/2022.acl-srw.11) (Pantazopoulos et al., ACL 2022)
ACL