% Conference paper in the LREC-COLING 2024 proceedings (aclanthology.org record 2024.lrec-main.1214).
% Acronyms in title/booktitle are brace-protected as whole words so sentence-casing styles keep their capitals.
@inproceedings{ren-li-2024-releasing-capacity,
  author    = {Ren, Da and
               Li, Qing},
  title     = {Releasing the Capacity of {GANs} in Non-Autoregressive Image Captioning},
  editor    = {Calzolari, Nicoletta and
               Kan, Min-Yen and
               Hoste, Veronique and
               Lenci, Alessandro and
               Sakti, Sakriani and
               Xue, Nianwen},
  booktitle = {Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation ({LREC-COLING} 2024)},
  month     = may,
  year      = {2024},
  address   = {Torino, Italia},
  publisher = {ELRA and ICCL},
  pages     = {13906--13918},
  url       = {https://aclanthology.org/2024.lrec-main.1214},
  abstract  = {Building Non-autoregressive (NAR) models in image captioning can fundamentally tackle the high inference latency of autoregressive models. However, existing NAR image captioning models are trained on maximum likelihood estimation, and suffer from their inherent multi-modality problem. Although constructing NAR models based on GANs can theoretically tackle this problem, existing GAN-based NAR models obtain poor performance when transferred to image captioning due to their incapacity of modeling complicated relations between images and text. To tackle this problem, we propose an Adversarial Non-autoregressive Transformer for Image Captioning (CaptionANT) by improving performance from two aspects: 1) modifying the model structure so as to be compatible with contrastive learning to effectively make use of unpaired samples; 2) integrating a reconstruction process to better utilize paired samples. By further combining with other effective techniques and our proposed lightweight structure, CaptionANT can better align input images and output text, and thus achieves new state-of-the-art performance for fully NAR models on the challenging MSCOCO dataset. More importantly, CaptionANT achieves a 26.72 times speedup compared to the autoregressive baseline with only 36.3{\%} the number of parameters of the existing best fully NAR model for image captioning.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 collection holding a single record, ID "ren-li-2024-releasing-capacity". -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="ren-li-2024-releasing-capacity">
    <!-- Title of the paper itself -->
    <titleInfo>
      <title>Releasing the Capacity of GANs in Non-Autoregressive Image Captioning</title>
    </titleInfo>
    <!-- Authors, in listed order -->
    <name type="personal">
      <namePart type="given">Da</namePart>
      <namePart type="family">Ren</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Qing</namePart>
      <namePart type="family">Li</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <!-- Issue date of the paper (year-month) -->
    <originInfo>
      <dateIssued>2024-05</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, with its editors, publisher and place -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)</title>
      </titleInfo>
      <!-- Editors of the proceedings, in listed order -->
      <name type="personal">
        <namePart type="given">Nicoletta</namePart>
        <namePart type="family">Calzolari</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Min-Yen</namePart>
        <namePart type="family">Kan</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Veronique</namePart>
        <namePart type="family">Hoste</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Alessandro</namePart>
        <namePart type="family">Lenci</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Sakriani</namePart>
        <namePart type="family">Sakti</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Nianwen</namePart>
        <namePart type="family">Xue</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>ELRA and ICCL</publisher>
        <place>
          <placeTerm type="text">Torino, Italia</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Building Non-autoregressive (NAR) models in image captioning can fundamentally tackle the high inference latency of autoregressive models. However, existing NAR image captioning models are trained on maximum likelihood estimation, and suffer from their inherent multi-modality problem. Although constructing NAR models based on GANs can theoretically tackle this problem, existing GAN-based NAR models obtain poor performance when transferred to image captioning due to their incapacity of modeling complicated relations between images and text. To tackle this problem, we propose an Adversarial Non-autoregressive Transformer for Image Captioning (CaptionANT) by improving performance from two aspects: 1) modifying the model structure so as to be compatible with contrastive learning to effectively make use of unpaired samples; 2) integrating a reconstruction process to better utilize paired samples. By further combining with other effective techniques and our proposed lightweight structure, CaptionANT can better align input images and output text, and thus achieves new state-of-the-art performance for fully NAR models on the challenging MSCOCO dataset. More importantly, CaptionANT achieves a 26.72 times speedup compared to the autoregressive baseline with only 36.3% the number of parameters of the existing best fully NAR model for image captioning.</abstract>
    <!-- Citation key and landing-page URL for this record -->
    <identifier type="citekey">ren-li-2024-releasing-capacity</identifier>
    <location>
      <url>https://aclanthology.org/2024.lrec-main.1214</url>
    </location>
    <!-- Date and page range of the paper -->
    <part>
      <date>2024-05</date>
      <extent unit="page">
        <start>13906</start>
        <end>13918</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Releasing the Capacity of GANs in Non-Autoregressive Image Captioning
%A Ren, Da
%A Li, Qing
%Y Calzolari, Nicoletta
%Y Kan, Min-Yen
%Y Hoste, Veronique
%Y Lenci, Alessandro
%Y Sakti, Sakriani
%Y Xue, Nianwen
%S Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)
%D 2024
%8 May
%I ELRA and ICCL
%C Torino, Italia
%F ren-li-2024-releasing-capacity
%X Building Non-autoregressive (NAR) models in image captioning can fundamentally tackle the high inference latency of autoregressive models. However, existing NAR image captioning models are trained on maximum likelihood estimation, and suffer from their inherent multi-modality problem. Although constructing NAR models based on GANs can theoretically tackle this problem, existing GAN-based NAR models obtain poor performance when transferred to image captioning due to their incapacity of modeling complicated relations between images and text. To tackle this problem, we propose an Adversarial Non-autoregressive Transformer for Image Captioning (CaptionANT) by improving performance from two aspects: 1) modifying the model structure so as to be compatible with contrastive learning to effectively make use of unpaired samples; 2) integrating a reconstruction process to better utilize paired samples. By further combining with other effective techniques and our proposed lightweight structure, CaptionANT can better align input images and output text, and thus achieves new state-of-the-art performance for fully NAR models on the challenging MSCOCO dataset. More importantly, CaptionANT achieves a 26.72 times speedup compared to the autoregressive baseline with only 36.3% the number of parameters of the existing best fully NAR model for image captioning.
%U https://aclanthology.org/2024.lrec-main.1214
%P 13906-13918
Markdown (Informal)
[Releasing the Capacity of GANs in Non-Autoregressive Image Captioning](https://aclanthology.org/2024.lrec-main.1214) (Ren & Li, LREC-COLING 2024)
ACL