@inproceedings{subedi-krishna-bal-2022-cnn,
title = "{CNN}-Transformer based Encoder-Decoder Model for {N}epali Image Captioning",
author = "Subedi, Bipesh and
Krishna Bal, Bal",
editor = "Akhtar, Md. Shad and
Chakraborty, Tanmoy",
booktitle = "Proceedings of the 19th International Conference on Natural Language Processing (ICON)",
month = dec,
year = "2022",
address = "New Delhi, India",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.icon-main.12/",
pages = "86--91",
abstract = "Many image captioning tasks have been carried out in recent years, the majority of the work being for the English language. A few research works have also been carried out for Hindi and Bengali languages in the domain. Unfortunately, not much research emphasis seems to be given to the Nepali language in this direction. Furthermore, the datasets are also not publicly available in the Nepali language. The aim of this research is to prepare a dataset with Nepali captions and develop a deep learning model based on the Convolutional Neural Network (CNN) and Transformer combined model to automatically generate image captions in the Nepali language. The dataset for this work is prepared by applying different data preprocessing techniques on the Flickr8k dataset. The preprocessed data is then passed to the CNN-Transformer model to generate image captions. ResNet-101 and EfficientNetB0 are the two pre-trained CNN models employed for this work. We have achieved some promising results which can be further improved in the future."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="subedi-krishna-bal-2022-cnn">
<titleInfo>
<title>CNN-Transformer based Encoder-Decoder Model for Nepali Image Captioning</title>
</titleInfo>
<name type="personal">
<namePart type="given">Bipesh</namePart>
<namePart type="family">Subedi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bal</namePart>
<namePart type="family">Krishna Bal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 19th International Conference on Natural Language Processing (ICON)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Md.</namePart>
<namePart type="given">Shad</namePart>
<namePart type="family">Akhtar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tanmoy</namePart>
<namePart type="family">Chakraborty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">New Delhi, India</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Many image captioning tasks have been carried out in recent years, the majority of the work being for the English language. A few research works have also been carried out for Hindi and Bengali languages in the domain. Unfortunately, not much research emphasis seems to be given to the Nepali language in this direction. Furthermore, the datasets are also not publicly available in the Nepali language. The aim of this research is to prepare a dataset with Nepali captions and develop a deep learning model based on the Convolutional Neural Network (CNN) and Transformer combined model to automatically generate image captions in the Nepali language. The dataset for this work is prepared by applying different data preprocessing techniques on the Flickr8k dataset. The preprocessed data is then passed to the CNN-Transformer model to generate image captions. ResNet-101 and EfficientNetB0 are the two pre-trained CNN models employed for this work. We have achieved some promising results which can be further improved in the future.</abstract>
<identifier type="citekey">subedi-krishna-bal-2022-cnn</identifier>
<location>
<url>https://aclanthology.org/2022.icon-main.12/</url>
</location>
<part>
<date>2022-12</date>
<extent unit="page">
<start>86</start>
<end>91</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T CNN-Transformer based Encoder-Decoder Model for Nepali Image Captioning
%A Subedi, Bipesh
%A Krishna Bal, Bal
%Y Akhtar, Md. Shad
%Y Chakraborty, Tanmoy
%S Proceedings of the 19th International Conference on Natural Language Processing (ICON)
%D 2022
%8 December
%I Association for Computational Linguistics
%C New Delhi, India
%F subedi-krishna-bal-2022-cnn
%X Many image captioning tasks have been carried out in recent years, the majority of the work being for the English language. A few research works have also been carried out for Hindi and Bengali languages in the domain. Unfortunately, not much research emphasis seems to be given to the Nepali language in this direction. Furthermore, the datasets are also not publicly available in the Nepali language. The aim of this research is to prepare a dataset with Nepali captions and develop a deep learning model based on the Convolutional Neural Network (CNN) and Transformer combined model to automatically generate image captions in the Nepali language. The dataset for this work is prepared by applying different data preprocessing techniques on the Flickr8k dataset. The preprocessed data is then passed to the CNN-Transformer model to generate image captions. ResNet-101 and EfficientNetB0 are the two pre-trained CNN models employed for this work. We have achieved some promising results which can be further improved in the future.
%U https://aclanthology.org/2022.icon-main.12/
%P 86-91
Markdown (Informal)
[CNN-Transformer based Encoder-Decoder Model for Nepali Image Captioning](https://aclanthology.org/2022.icon-main.12/) (Subedi & Krishna Bal, ICON 2022)
ACL