@inproceedings{simplicio-etal-2024-v,
title = "{V}-{G}l{\'o}r{IA} - Customizing Large Vision and Language Models to {E}uropean {P}ortuguese",
author = "Simpl{\'\i}cio, Afonso and
Semedo, David and
Magalhaes, Joao",
editor = "Kumar, Sachin and
Balachandran, Vidhisha and
Park, Chan Young and
Shi, Weijia and
Hayati, Shirley Anugrah and
Tsvetkov, Yulia and
Smith, Noah and
Hajishirzi, Hannaneh and
Kang, Dongyeop and
Jurgens, David",
booktitle = "Proceedings of the 1st Workshop on Customizable NLP: Progress and Challenges in Customizing NLP for a Domain, Application, Group, or Individual (CustomNLP4U)",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.customnlp4u-1.24",
pages = "317--326",
abstract = "Generative Vision and Language models have obtained remarkable results recently, thanks to the use of robust pre-trained Visual encoders and Large Language Models (LLMs), together with efficient model adaptation training strategies, requiring minimal architecturalmodifications, while preserving LLMs{'} original capabilities. With these advances focusing mainly on the English language, there is a gap in customization methodologies for other languages. In this paper, we propose a customization methodology that adapts existingstate-of-the-art vision and language architectures to European Portuguese (PT-PT). As a result of applying this methodology, we introduce V-Gl{\'o}rIA , the first Large Vision and Language generative model specifically customized for European Portuguese. V-Gl{\'o}rIA supports multimodal tasks such as image captioning, retrieval, and dialogue. To deliver V-Gl{\'o}rIA, we leverage state-of-the-art V{\&}L architectures, and contribute with PT-PT machine-translated pre-training (CC3M PT-PT) and benchmark (MSCOCO PT-PT and VisDial PT-PT) datasets.Our experiments show that V-Gl{\'o}rIA delivers promising performance in text-image retrieval and downstream tasks in a zero-shot setting, such as image captioning and visual dialogue tasks, highlighting the effectiveness of our customization approach.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="simplicio-etal-2024-v">
<titleInfo>
<title>V-GlórIA - Customizing Large Vision and Language Models to European Portuguese</title>
</titleInfo>
<name type="personal">
<namePart type="given">Afonso</namePart>
<namePart type="family">Simplício</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Semedo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joao</namePart>
<namePart type="family">Magalhaes</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 1st Workshop on Customizable NLP: Progress and Challenges in Customizing NLP for a Domain, Application, Group, or Individual (CustomNLP4U)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Sachin</namePart>
<namePart type="family">Kumar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vidhisha</namePart>
<namePart type="family">Balachandran</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chan</namePart>
<namePart type="given">Young</namePart>
<namePart type="family">Park</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Weijia</namePart>
<namePart type="family">Shi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shirley</namePart>
<namePart type="given">Anugrah</namePart>
<namePart type="family">Hayati</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yulia</namePart>
<namePart type="family">Tsvetkov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Noah</namePart>
<namePart type="family">Smith</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hannaneh</namePart>
<namePart type="family">Hajishirzi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dongyeop</namePart>
<namePart type="family">Kang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Jurgens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, Florida, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Generative Vision and Language models have obtained remarkable results recently, thanks to the use of robust pre-trained Visual encoders and Large Language Models (LLMs), together with efficient model adaptation training strategies, requiring minimal architecturalmodifications, while preserving LLMs’ original capabilities. With these advances focusing mainly on the English language, there is a gap in customization methodologies for other languages. In this paper, we propose a customization methodology that adapts existingstate-of-the-art vision and language architectures to European Portuguese (PT-PT). As a result of applying this methodology, we introduce V-GlórIA , the first Large Vision and Language generative model specifically customized for European Portuguese. V-GlórIA supports multimodal tasks such as image captioning, retrieval, and dialogue. To deliver V-GlórIA, we leverage state-of-the-art V&L architectures, and contribute with PT-PT machine-translated pre-training (CC3M PT-PT) and benchmark (MSCOCO PT-PT and VisDial PT-PT) datasets.Our experiments show that V-GlórIA delivers promising performance in text-image retrieval and downstream tasks in a zero-shot setting, such as image captioning and visual dialogue tasks, highlighting the effectiveness of our customization approach.</abstract>
<identifier type="citekey">simplicio-etal-2024-v</identifier>
<location>
<url>https://aclanthology.org/2024.customnlp4u-1.24</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>317</start>
<end>326</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T V-GlórIA - Customizing Large Vision and Language Models to European Portuguese
%A Simplício, Afonso
%A Semedo, David
%A Magalhaes, Joao
%Y Kumar, Sachin
%Y Balachandran, Vidhisha
%Y Park, Chan Young
%Y Shi, Weijia
%Y Hayati, Shirley Anugrah
%Y Tsvetkov, Yulia
%Y Smith, Noah
%Y Hajishirzi, Hannaneh
%Y Kang, Dongyeop
%Y Jurgens, David
%S Proceedings of the 1st Workshop on Customizable NLP: Progress and Challenges in Customizing NLP for a Domain, Application, Group, or Individual (CustomNLP4U)
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F simplicio-etal-2024-v
%X Generative Vision and Language models have obtained remarkable results recently, thanks to the use of robust pre-trained Visual encoders and Large Language Models (LLMs), together with efficient model adaptation training strategies, requiring minimal architecturalmodifications, while preserving LLMs’ original capabilities. With these advances focusing mainly on the English language, there is a gap in customization methodologies for other languages. In this paper, we propose a customization methodology that adapts existingstate-of-the-art vision and language architectures to European Portuguese (PT-PT). As a result of applying this methodology, we introduce V-GlórIA , the first Large Vision and Language generative model specifically customized for European Portuguese. V-GlórIA supports multimodal tasks such as image captioning, retrieval, and dialogue. To deliver V-GlórIA, we leverage state-of-the-art V&L architectures, and contribute with PT-PT machine-translated pre-training (CC3M PT-PT) and benchmark (MSCOCO PT-PT and VisDial PT-PT) datasets.Our experiments show that V-GlórIA delivers promising performance in text-image retrieval and downstream tasks in a zero-shot setting, such as image captioning and visual dialogue tasks, highlighting the effectiveness of our customization approach.
%U https://aclanthology.org/2024.customnlp4u-1.24
%P 317-326
Markdown (Informal)
[V-GlórIA - Customizing Large Vision and Language Models to European Portuguese](https://aclanthology.org/2024.customnlp4u-1.24) (Simplício et al., CustomNLP4U 2024)
ACL