@inproceedings{ramos-etal-2024-paella,
title = "{PAELLA}: Parameter-Efficient Lightweight Language-Agnostic Captioning Model",
author = "Ramos, Rita and
Bugliarello, Emanuele and
Martins, Bruno and
Elliott, Desmond",
editor = "Duh, Kevin and
Gomez, Helena and
Bethard, Steven",
booktitle = "Findings of the Association for Computational Linguistics: NAACL 2024",
month = jun,
year = "2024",
address = "Mexico City, Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.findings-naacl.225",
doi = "10.18653/v1/2024.findings-naacl.225",
pages = "3549--3564",
abstract = "We introduce PAELLA, a Parameter-Efficient Lightweight Language-Agnostic image captioning model designed to be both parameter and data-efficient using retrieval augmentation. The model is trained by learning a small mapping network with 34M parameters between a pre-trained visual model and a multilingual language model that is conditioned on two types of input: (i) the image itself, and (ii) a set of retrieved captions in the target language. The retrieved examples play a key role in guiding the model to generate captions across languages. Through retrieval, the model can be lightweight in terms of the number of trainable parameters, which only exist in its mapping network, and also in the amount of multilingual training data that is required. Experiments on the XM3600 dataset, featuring 36 languages, show that PAELLA can outperform or compete against some models with 3{--}77$\times$ more learned parameters and 35{--}863$\times$ more data, particularly in low-resource languages. We also find that PAELLA can be trained on only monolingual data and still show strong zero-shot abilities in other languages.",
}
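The abstract above describes the architecture at a high level: the only trainable parameters live in a small mapping network that projects features from a frozen, pre-trained visual encoder into the embedding space of a frozen multilingual language model, which is then conditioned on both the image and retrieved target-language captions. As a purely illustrative aid, here is a minimal Python sketch of that mapping-network idea; the class name, all dimensions, and the layer choices (`MappingNetwork`, `visual_dim`, `lm_dim`, `prefix_len`, the Linear+Tanh projection) are assumptions for exposition, not the paper's actual configuration.

```python
import torch
import torch.nn as nn

class MappingNetwork(nn.Module):
    """Illustrative sketch (not the paper's code): project frozen image
    features into a short sequence of 'prefix' embeddings in the frozen
    multilingual LM's embedding space. All sizes are assumed values."""

    def __init__(self, visual_dim: int = 768, lm_dim: int = 1024, prefix_len: int = 10):
        super().__init__()
        self.prefix_len = prefix_len
        self.lm_dim = lm_dim
        # The only trainable parameters live here, mirroring the abstract's
        # claim that trainable weights exist only in the mapping network.
        self.proj = nn.Sequential(
            nn.Linear(visual_dim, lm_dim * prefix_len),
            nn.Tanh(),
        )

    def forward(self, visual_feats: torch.Tensor) -> torch.Tensor:
        # visual_feats: (batch, visual_dim) pooled features from a frozen
        # vision encoder; output: (batch, prefix_len, lm_dim).
        return self.proj(visual_feats).view(-1, self.prefix_len, self.lm_dim)

# Hypothetical usage: the frozen LM would be conditioned on the image prefix
# together with the tokens of retrieved target-language captions, and trained
# to generate the caption for the image.
mapper = MappingNetwork()
prefix = mapper(torch.randn(2, 768))
print(prefix.shape)  # torch.Size([2, 10, 1024])
```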
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="ramos-etal-2024-paella">
    <titleInfo>
      <title>PAELLA: Parameter-Efficient Lightweight Language-Agnostic Captioning Model</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Rita</namePart>
      <namePart type="family">Ramos</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Emanuele</namePart>
      <namePart type="family">Bugliarello</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Bruno</namePart>
      <namePart type="family">Martins</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Desmond</namePart>
      <namePart type="family">Elliott</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2024-06</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: NAACL 2024</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Kevin</namePart>
        <namePart type="family">Duh</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Helena</namePart>
        <namePart type="family">Gomez</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Steven</namePart>
        <namePart type="family">Bethard</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Mexico City, Mexico</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>We introduce PAELLA, a Parameter-Efficient Lightweight Language-Agnostic image captioning model designed to be both parameter and data-efficient using retrieval augmentation. The model is trained by learning a small mapping network with 34M parameters between a pre-trained visual model and a multilingual language model that is conditioned on two types of input: (i) the image itself, and (ii) a set of retrieved captions in the target language. The retrieved examples play a key role in guiding the model to generate captions across languages. Through retrieval, the model can be lightweight in terms of the number of trainable parameters, which only exist in its mapping network, and also in the amount of multilingual training data that is required. Experiments on the XM3600 dataset, featuring 36 languages, show that PAELLA can outperform or compete against some models with 3–77× more learned parameters and 35–863× more data, particularly in low-resource languages. We also find that PAELLA can be trained on only monolingual data and still show strong zero-shot abilities in other languages.</abstract>
    <identifier type="citekey">ramos-etal-2024-paella</identifier>
    <identifier type="doi">10.18653/v1/2024.findings-naacl.225</identifier>
    <location>
      <url>https://aclanthology.org/2024.findings-naacl.225</url>
    </location>
    <part>
      <date>2024-06</date>
      <extent unit="page">
        <start>3549</start>
        <end>3564</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T PAELLA: Parameter-Efficient Lightweight Language-Agnostic Captioning Model
%A Ramos, Rita
%A Bugliarello, Emanuele
%A Martins, Bruno
%A Elliott, Desmond
%Y Duh, Kevin
%Y Gomez, Helena
%Y Bethard, Steven
%S Findings of the Association for Computational Linguistics: NAACL 2024
%D 2024
%8 June
%I Association for Computational Linguistics
%C Mexico City, Mexico
%F ramos-etal-2024-paella
%X We introduce PAELLA, a Parameter-Efficient Lightweight Language-Agnostic image captioning model designed to be both parameter and data-efficient using retrieval augmentation. The model is trained by learning a small mapping network with 34M parameters between a pre-trained visual model and a multilingual language model that is conditioned on two types of input: (i) the image itself, and (ii) a set of retrieved captions in the target language. The retrieved examples play a key role in guiding the model to generate captions across languages. Through retrieval, the model can be lightweight in terms of the number of trainable parameters, which only exist in its mapping network, and also in the amount of multilingual training data that is required. Experiments on the XM3600 dataset, featuring 36 languages, show that PAELLA can outperform or compete against some models with 3–77× more learned parameters and 35–863× more data, particularly in low-resource languages. We also find that PAELLA can be trained on only monolingual data and still show strong zero-shot abilities in other languages.
%R 10.18653/v1/2024.findings-naacl.225
%U https://aclanthology.org/2024.findings-naacl.225
%U https://doi.org/10.18653/v1/2024.findings-naacl.225
%P 3549-3564
Markdown (Informal)
[PAELLA: Parameter-Efficient Lightweight Language-Agnostic Captioning Model](https://aclanthology.org/2024.findings-naacl.225) (Ramos et al., Findings 2024)
ACL
Rita Ramos, Emanuele Bugliarello, Bruno Martins, and Desmond Elliott. 2024. [PAELLA: Parameter-Efficient Lightweight Language-Agnostic Captioning Model](https://aclanthology.org/2024.findings-naacl.225). In *Findings of the Association for Computational Linguistics: NAACL 2024*, pages 3549–3564, Mexico City, Mexico. Association for Computational Linguistics.