@inproceedings{nagasawa-etal-2021-validity,
title = "Validity-Based Sampling and Smoothing Methods for Multiple Reference Image Captioning",
author = "Nagasawa, Shunta and
Watanabe, Yotaro and
Iyatomi, Hitoshi",
editor = "Zadeh, Amir and
Morency, Louis-Philippe and
Liang, Paul Pu and
Ross, Candace and
Salakhutdinov, Ruslan and
Poria, Soujanya and
Cambria, Erik and
Shi, Kelly",
booktitle = "Proceedings of the Third Workshop on Multimodal Artificial Intelligence",
month = jun,
year = "2021",
address = "Mexico City, Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.maiworkshop-1.6/",
doi = "10.18653/v1/2021.maiworkshop-1.6",
pages = "36--41",
abstract = "In image captioning, multiple captions are often provided as ground truths, since a valid caption is not always uniquely determined. Conventional methods randomly select a single caption and treat it as correct, but there have been few effective training methods that utilize multiple given captions. In this paper, we proposed two training technique for making effective use of multiple reference captions: 1) validity-based caption sampling (VBCS), which prioritizes the use of captions that are estimated to be highly valid during training, and 2) weighted caption smoothing (WCS), which applies smoothing only to the relevant words the reference caption to reflect multiple reference captions simultaneously. Experiments show that our proposed methods improve CIDEr by 2.6 points and BLEU4 by 0.9 points from baseline on the MSCOCO dataset."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="nagasawa-etal-2021-validity">
<titleInfo>
<title>Validity-Based Sampling and Smoothing Methods for Multiple Reference Image Captioning</title>
</titleInfo>
<name type="personal">
<namePart type="given">Shunta</namePart>
<namePart type="family">Nagasawa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yotaro</namePart>
<namePart type="family">Watanabe</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hitoshi</namePart>
<namePart type="family">Iyatomi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Third Workshop on Multimodal Artificial Intelligence</title>
</titleInfo>
<name type="personal">
<namePart type="given">Amir</namePart>
<namePart type="family">Zadeh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Louis-Philippe</namePart>
<namePart type="family">Morency</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Paul</namePart>
<namePart type="given">Pu</namePart>
<namePart type="family">Liang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Candace</namePart>
<namePart type="family">Ross</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ruslan</namePart>
<namePart type="family">Salakhutdinov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Soujanya</namePart>
<namePart type="family">Poria</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Erik</namePart>
<namePart type="family">Cambria</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kelly</namePart>
<namePart type="family">Shi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Mexico City, Mexico</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>In image captioning, multiple captions are often provided as ground truths, since a valid caption is not always uniquely determined. Conventional methods randomly select a single caption and treat it as correct, but there have been few effective training methods that utilize multiple given captions. In this paper, we proposed two training technique for making effective use of multiple reference captions: 1) validity-based caption sampling (VBCS), which prioritizes the use of captions that are estimated to be highly valid during training, and 2) weighted caption smoothing (WCS), which applies smoothing only to the relevant words the reference caption to reflect multiple reference captions simultaneously. Experiments show that our proposed methods improve CIDEr by 2.6 points and BLEU4 by 0.9 points from baseline on the MSCOCO dataset.</abstract>
<identifier type="citekey">nagasawa-etal-2021-validity</identifier>
<identifier type="doi">10.18653/v1/2021.maiworkshop-1.6</identifier>
<location>
<url>https://aclanthology.org/2021.maiworkshop-1.6/</url>
</location>
<part>
<date>2021-06</date>
<extent unit="page">
<start>36</start>
<end>41</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Validity-Based Sampling and Smoothing Methods for Multiple Reference Image Captioning
%A Nagasawa, Shunta
%A Watanabe, Yotaro
%A Iyatomi, Hitoshi
%Y Zadeh, Amir
%Y Morency, Louis-Philippe
%Y Liang, Paul Pu
%Y Ross, Candace
%Y Salakhutdinov, Ruslan
%Y Poria, Soujanya
%Y Cambria, Erik
%Y Shi, Kelly
%S Proceedings of the Third Workshop on Multimodal Artificial Intelligence
%D 2021
%8 June
%I Association for Computational Linguistics
%C Mexico City, Mexico
%F nagasawa-etal-2021-validity
%X In image captioning, multiple captions are often provided as ground truths, since a valid caption is not always uniquely determined. Conventional methods randomly select a single caption and treat it as correct, but there have been few effective training methods that utilize multiple given captions. In this paper, we proposed two training technique for making effective use of multiple reference captions: 1) validity-based caption sampling (VBCS), which prioritizes the use of captions that are estimated to be highly valid during training, and 2) weighted caption smoothing (WCS), which applies smoothing only to the relevant words the reference caption to reflect multiple reference captions simultaneously. Experiments show that our proposed methods improve CIDEr by 2.6 points and BLEU4 by 0.9 points from baseline on the MSCOCO dataset.
%R 10.18653/v1/2021.maiworkshop-1.6
%U https://aclanthology.org/2021.maiworkshop-1.6/
%U https://doi.org/10.18653/v1/2021.maiworkshop-1.6
%P 36-41
Markdown (Informal)
[Validity-Based Sampling and Smoothing Methods for Multiple Reference Image Captioning](https://aclanthology.org/2021.maiworkshop-1.6/) (Nagasawa et al., maiworkshop 2021)
ACL