@inproceedings{ng-etal-2021-understanding,
title = "Understanding Guided Image Captioning Performance across Domains",
author = "Ng, Edwin G. and
Pang, Bo and
Sharma, Piyush and
Soricut, Radu",
editor = "Bisazza, Arianna and
Abend, Omri",
booktitle = "Proceedings of the 25th Conference on Computational Natural Language Learning",
month = nov,
year = "2021",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.conll-1.14/",
doi = "10.18653/v1/2021.conll-1.14",
pages = "183--193",
abstract = "Image captioning models generally lack the capability to take into account user interest, and usually default to global descriptions that try to balance readability, informativeness, and information overload. We present a Transformer-based model with the ability to produce captions focused on specific objects, concepts or actions in an image by providing them as guiding text to the model. Further, we evaluate the quality of these guided captions when trained on Conceptual Captions which contain 3.3M image-level captions compared to Visual Genome which contain 3.6M object-level captions. Counter-intuitively, we find that guided captions produced by the model trained on Conceptual Captions generalize better on out-of-domain data. Our human-evaluation results indicate that attempting in-the-wild guided image captioning requires access to large, unrestricted-domain training datasets, and that increased style diversity (even without increasing the number of unique tokens) is a key factor for improved performance."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="ng-etal-2021-understanding">
    <titleInfo>
      <title>Understanding Guided Image Captioning Performance across Domains</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Edwin</namePart>
      <namePart type="given">G</namePart>
      <namePart type="family">Ng</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Bo</namePart>
      <namePart type="family">Pang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Piyush</namePart>
      <namePart type="family">Sharma</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Radu</namePart>
      <namePart type="family">Soricut</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2021-11</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 25th Conference on Computational Natural Language Learning</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Arianna</namePart>
        <namePart type="family">Bisazza</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Omri</namePart>
        <namePart type="family">Abend</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Online</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Image captioning models generally lack the capability to take into account user interest, and usually default to global descriptions that try to balance readability, informativeness, and information overload. We present a Transformer-based model with the ability to produce captions focused on specific objects, concepts or actions in an image by providing them as guiding text to the model. Further, we evaluate the quality of these guided captions when trained on Conceptual Captions which contain 3.3M image-level captions compared to Visual Genome which contain 3.6M object-level captions. Counter-intuitively, we find that guided captions produced by the model trained on Conceptual Captions generalize better on out-of-domain data. Our human-evaluation results indicate that attempting in-the-wild guided image captioning requires access to large, unrestricted-domain training datasets, and that increased style diversity (even without increasing the number of unique tokens) is a key factor for improved performance.</abstract>
    <identifier type="citekey">ng-etal-2021-understanding</identifier>
    <identifier type="doi">10.18653/v1/2021.conll-1.14</identifier>
    <location>
      <url>https://aclanthology.org/2021.conll-1.14/</url>
    </location>
    <part>
      <date>2021-11</date>
      <extent unit="page">
        <start>183</start>
        <end>193</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Understanding Guided Image Captioning Performance across Domains
%A Ng, Edwin G.
%A Pang, Bo
%A Sharma, Piyush
%A Soricut, Radu
%Y Bisazza, Arianna
%Y Abend, Omri
%S Proceedings of the 25th Conference on Computational Natural Language Learning
%D 2021
%8 November
%I Association for Computational Linguistics
%C Online
%F ng-etal-2021-understanding
%X Image captioning models generally lack the capability to take into account user interest, and usually default to global descriptions that try to balance readability, informativeness, and information overload. We present a Transformer-based model with the ability to produce captions focused on specific objects, concepts or actions in an image by providing them as guiding text to the model. Further, we evaluate the quality of these guided captions when trained on Conceptual Captions which contain 3.3M image-level captions compared to Visual Genome which contain 3.6M object-level captions. Counter-intuitively, we find that guided captions produced by the model trained on Conceptual Captions generalize better on out-of-domain data. Our human-evaluation results indicate that attempting in-the-wild guided image captioning requires access to large, unrestricted-domain training datasets, and that increased style diversity (even without increasing the number of unique tokens) is a key factor for improved performance.
%R 10.18653/v1/2021.conll-1.14
%U https://aclanthology.org/2021.conll-1.14/
%U https://doi.org/10.18653/v1/2021.conll-1.14
%P 183-193
Markdown (Informal)
[Understanding Guided Image Captioning Performance across Domains](https://aclanthology.org/2021.conll-1.14/) (Ng et al., CoNLL 2021)
ACL
Edwin G. Ng, Bo Pang, Piyush Sharma, and Radu Soricut. 2021. [Understanding Guided Image Captioning Performance across Domains](https://aclanthology.org/2021.conll-1.14/). In *Proceedings of the 25th Conference on Computational Natural Language Learning*, pages 183–193, Online. Association for Computational Linguistics.