@inproceedings{khan-etal-2024-chitranuvad,
title = "Chitranuvad: Adapting Multi-lingual {LLM}s for Multimodal Translation",
author = "Khan, Shaharukh and
Tarun, Ayush and
Faraz, Ali and
Kamble, Palash and
Dahiya, Vivek and
Pokala, Praveen and
Kulkarni, Ashish and
Khatri, Chandra and
Ravi, Abhinav and
Agarwal, Shubham",
editor = "Haddow, Barry and
Kocmi, Tom and
Koehn, Philipp and
Monz, Christof",
booktitle = "Proceedings of the Ninth Conference on Machine Translation",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.wmt-1.80",
pages = "839--851",
abstract = "In this work, we provide the system description of our submission as part of the English-to-Lowres Multimodal Translation Task at theWorkshop on Asian Translation (WAT2024). We introduce Chitranuvad, a multimodal model that effectively integrates Multilingual LLMand a vision module for Multimodal Translation. Our method uses a ViT image encoder to extract visual representations as visual tokenembeddings which are projected to the LLM space by an adapter layer and generates translation in an autoregressive fashion. We participated in all the three tracks (Image Captioning, Text-only and Multimodal translationtasks) for Indic languages (ie. English translation to Hindi, Bengali and Malyalam) and achieved SOTA results for Hindi in all of themon the Challenge set while remaining competitive for the other languages in the shared task.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="khan-etal-2024-chitranuvad">
<titleInfo>
<title>Chitranuvad: Adapting Multi-lingual LLMs for Multimodal Translation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Shaharukh</namePart>
<namePart type="family">Khan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ayush</namePart>
<namePart type="family">Tarun</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ali</namePart>
<namePart type="family">Faraz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Palash</namePart>
<namePart type="family">Kamble</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vivek</namePart>
<namePart type="family">Dahiya</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Praveen</namePart>
<namePart type="family">Pokala</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ashish</namePart>
<namePart type="family">Kulkarni</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chandra</namePart>
<namePart type="family">Khatri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Abhinav</namePart>
<namePart type="family">Ravi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shubham</namePart>
<namePart type="family">Agarwal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Ninth Conference on Machine Translation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Barry</namePart>
<namePart type="family">Haddow</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tom</namePart>
<namePart type="family">Kocmi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Philipp</namePart>
<namePart type="family">Koehn</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Christof</namePart>
<namePart type="family">Monz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, Florida, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>In this work, we provide the system description of our submission as part of the English-to-Lowres Multimodal Translation Task at the Workshop on Asian Translation (WAT2024). We introduce Chitranuvad, a multimodal model that effectively integrates a Multilingual LLM and a vision module for Multimodal Translation. Our method uses a ViT image encoder to extract visual representations as visual token embeddings, which are projected to the LLM space by an adapter layer, and generates translation in an autoregressive fashion. We participated in all three tracks (Image Captioning, Text-only and Multimodal translation tasks) for Indic languages (i.e., English translation to Hindi, Bengali and Malayalam) and achieved SOTA results for Hindi in all of them on the Challenge set, while remaining competitive for the other languages in the shared task.</abstract>
<identifier type="citekey">khan-etal-2024-chitranuvad</identifier>
<location>
<url>https://aclanthology.org/2024.wmt-1.80</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>839</start>
<end>851</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Chitranuvad: Adapting Multi-lingual LLMs for Multimodal Translation
%A Khan, Shaharukh
%A Tarun, Ayush
%A Faraz, Ali
%A Kamble, Palash
%A Dahiya, Vivek
%A Pokala, Praveen
%A Kulkarni, Ashish
%A Khatri, Chandra
%A Ravi, Abhinav
%A Agarwal, Shubham
%Y Haddow, Barry
%Y Kocmi, Tom
%Y Koehn, Philipp
%Y Monz, Christof
%S Proceedings of the Ninth Conference on Machine Translation
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F khan-etal-2024-chitranuvad
%X In this work, we provide the system description of our submission as part of the English-to-Lowres Multimodal Translation Task at the Workshop on Asian Translation (WAT2024). We introduce Chitranuvad, a multimodal model that effectively integrates a Multilingual LLM and a vision module for Multimodal Translation. Our method uses a ViT image encoder to extract visual representations as visual token embeddings, which are projected to the LLM space by an adapter layer, and generates translation in an autoregressive fashion. We participated in all three tracks (Image Captioning, Text-only and Multimodal translation tasks) for Indic languages (i.e., English translation to Hindi, Bengali and Malayalam) and achieved SOTA results for Hindi in all of them on the Challenge set, while remaining competitive for the other languages in the shared task.
%U https://aclanthology.org/2024.wmt-1.80
%P 839-851
Markdown (Informal)
[Chitranuvad: Adapting Multi-lingual LLMs for Multimodal Translation](https://aclanthology.org/2024.wmt-1.80) (Khan et al., WMT 2024)
ACL
- Shaharukh Khan, Ayush Tarun, Ali Faraz, Palash Kamble, Vivek Dahiya, Praveen Pokala, Ashish Kulkarni, Chandra Khatri, Abhinav Ravi, and Shubham Agarwal. 2024. Chitranuvad: Adapting Multi-lingual LLMs for Multimodal Translation. In Proceedings of the Ninth Conference on Machine Translation, pages 839–851, Miami, Florida, USA. Association for Computational Linguistics.
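The abstract describes the architecture at a high level: a ViT image encoder produces visual token embeddings, an adapter layer projects them into the LLM embedding space, and the multilingual LLM generates the translation autoregressively from that multimodal input. Below is a minimal, illustrative PyTorch sketch of this projection-and-prefix pattern; the module names, dimensions, and the choice of a simple linear adapter are assumptions made for illustration, not the authors' released implementation.

```python
# Minimal sketch (not the paper's code): ViT-style patch features are projected
# by an adapter into the LLM embedding space and prepended to the source-text
# embeddings before autoregressive decoding. All sizes are illustrative.
import torch
import torch.nn as nn


class VisualAdapter(nn.Module):
    """Projects vision-encoder outputs into the LLM embedding space."""

    def __init__(self, vision_dim: int, llm_dim: int):
        super().__init__()
        self.proj = nn.Linear(vision_dim, llm_dim)

    def forward(self, visual_tokens: torch.Tensor) -> torch.Tensor:
        # visual_tokens: (batch, num_patches, vision_dim)
        return self.proj(visual_tokens)


# Stand-ins for a pretrained ViT encoder and a multilingual decoder-only LLM.
vision_dim, llm_dim, vocab = 1024, 4096, 32000
vit = nn.Identity()  # placeholder: assume precomputed ViT patch features
adapter = VisualAdapter(vision_dim, llm_dim)
text_embed = nn.Embedding(vocab, llm_dim)

# One image -> 256 patch embeddings; one tokenized English source sentence.
patch_features = torch.randn(1, 256, vision_dim)
src_token_ids = torch.randint(0, vocab, (1, 32))

visual_embeds = adapter(vit(patch_features))  # (1, 256, llm_dim)
text_embeds = text_embed(src_token_ids)       # (1, 32, llm_dim)

# Multimodal prefix fed to the LLM; the target-language translation is then
# generated token by token, conditioned on this sequence.
llm_inputs = torch.cat([visual_embeds, text_embeds], dim=1)
print(llm_inputs.shape)  # torch.Size([1, 288, 4096])
```

In a full system, this concatenated sequence would be passed to a pretrained multilingual LLM as input embeddings; which components are frozen versus fine-tuned is a training choice not shown here.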