@inproceedings{pa-pa-aung-etal-2020-automatic,
title = "Automatic {M}yanmar Image Captioning using {CNN} and {LSTM}-Based Language Model",
author = "Pa Pa Aung, San and
Pa Pa, Win and
Nwe, Tin Lay",
editor = "Beermann, Dorothee and
Besacier, Laurent and
Sakti, Sakriani and
Soria, Claudia",
booktitle = "Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)",
month = may,
year = "2020",
address = "Marseille, France",
publisher = "European Language Resources association",
url = "https://aclanthology.org/2020.sltu-1.19",
pages = "139--143",
abstract = "An image captioning system involves modules on computer vision as well as natural language processing. Computer vision module is for detecting salient objects or extracting features of images and Natural Language Processing (NLP) module is for generating correct syntactic and semantic image captions. Although many image caption datasets such as Flickr8k, Flickr30k and MSCOCO are publicly available, most of the datasets are captioned in English language. There is no image caption corpus for Myanmar language. Myanmar image caption corpus is manually built as part of the Flickr8k dataset in this current work. Furthermore, a generative merge model based on Convolutional Neural Network (CNN) and Long-Short Term Memory (LSTM) is applied especially for Myanmar image captioning. Next, two conventional feature extraction models Visual Geometry Group (VGG) OxfordNet 16-layer and 19-layer are compared. The performance of this system is evaluated on Myanmar image caption corpus using BLEU scores and 10-fold cross validation.",
language = "English",
ISBN = "979-10-95546-35-1",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="pa-pa-aung-etal-2020-automatic">
<titleInfo>
<title>Automatic Myanmar Image Captioning using CNN and LSTM-Based Language Model</title>
</titleInfo>
<name type="personal">
<namePart type="given">San</namePart>
<namePart type="family">Pa Pa Aung</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Win</namePart>
<namePart type="family">Pa Pa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tin</namePart>
<namePart type="given">Lay</namePart>
<namePart type="family">Nwe</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<language>
<languageTerm type="text">English</languageTerm>
<languageTerm type="code" authority="iso639-2b">eng</languageTerm>
</language>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Dorothee</namePart>
<namePart type="family">Beermann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Laurent</namePart>
<namePart type="family">Besacier</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sakriani</namePart>
<namePart type="family">Sakti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Claudia</namePart>
<namePart type="family">Soria</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>European Language Resources association</publisher>
<place>
<placeTerm type="text">Marseille, France</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-10-95546-35-1</identifier>
</relatedItem>
<abstract>An image captioning system involves modules on computer vision as well as natural language processing. Computer vision module is for detecting salient objects or extracting features of images and Natural Language Processing (NLP) module is for generating correct syntactic and semantic image captions. Although many image caption datasets such as Flickr8k, Flickr30k and MSCOCO are publicly available, most of the datasets are captioned in English language. There is no image caption corpus for Myanmar language. Myanmar image caption corpus is manually built as part of the Flickr8k dataset in this current work. Furthermore, a generative merge model based on Convolutional Neural Network (CNN) and Long-Short Term Memory (LSTM) is applied especially for Myanmar image captioning. Next, two conventional feature extraction models Visual Geometry Group (VGG) OxfordNet 16-layer and 19-layer are compared. The performance of this system is evaluated on Myanmar image caption corpus using BLEU scores and 10-fold cross validation.</abstract>
<identifier type="citekey">pa-pa-aung-etal-2020-automatic</identifier>
<location>
<url>https://aclanthology.org/2020.sltu-1.19</url>
</location>
<part>
<date>2020-05</date>
<extent unit="page">
<start>139</start>
<end>143</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Automatic Myanmar Image Captioning using CNN and LSTM-Based Language Model
%A Pa Pa Aung, San
%A Pa Pa, Win
%A Nwe, Tin Lay
%Y Beermann, Dorothee
%Y Besacier, Laurent
%Y Sakti, Sakriani
%Y Soria, Claudia
%S Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)
%D 2020
%8 May
%I European Language Resources association
%C Marseille, France
%@ 979-10-95546-35-1
%G English
%F pa-pa-aung-etal-2020-automatic
%X An image captioning system involves modules on computer vision as well as natural language processing. Computer vision module is for detecting salient objects or extracting features of images and Natural Language Processing (NLP) module is for generating correct syntactic and semantic image captions. Although many image caption datasets such as Flickr8k, Flickr30k and MSCOCO are publicly available, most of the datasets are captioned in English language. There is no image caption corpus for Myanmar language. Myanmar image caption corpus is manually built as part of the Flickr8k dataset in this current work. Furthermore, a generative merge model based on Convolutional Neural Network (CNN) and Long-Short Term Memory (LSTM) is applied especially for Myanmar image captioning. Next, two conventional feature extraction models Visual Geometry Group (VGG) OxfordNet 16-layer and 19-layer are compared. The performance of this system is evaluated on Myanmar image caption corpus using BLEU scores and 10-fold cross validation.
%U https://aclanthology.org/2020.sltu-1.19
%P 139-143
Markdown (Informal)
[Automatic Myanmar Image Captioning using CNN and LSTM-Based Language Model](https://aclanthology.org/2020.sltu-1.19) (Pa Pa Aung et al., SLTU 2020)
ACL
- San Pa Pa Aung, Win Pa Pa, and Tin Lay Nwe. 2020. Automatic Myanmar Image Captioning using CNN and LSTM-Based Language Model. In Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL), pages 139–143, Marseille, France. European Language Resources association.