@inproceedings{yang-okazaki-2020-image,
    title = "Image Caption Generation for News Articles",
    author = "Yang, Zhishen and
      Okazaki, Naoaki",
    editor = "Scott, Donia and
      Bel, Nuria and
      Zong, Chengqing",
    booktitle = "Proceedings of the 28th International Conference on Computational Linguistics",
    month = dec,
    year = "2020",
    address = "Barcelona, Spain (Online)",
    publisher = "International Committee on Computational Linguistics",
    url = "https://aclanthology.org/2020.coling-main.176",
    doi = "10.18653/v1/2020.coling-main.176",
    pages = "1941--1951",
    abstract = "In this paper, we address the task of news-image captioning, which generates a description of an image given the image and its article body as input. This task is more challenging than conventional image captioning because it requires a joint understanding of image and text. We present a Transformer model that integrates text and image modalities and attends to textual features from visual features when generating a caption. Experiments based on automatic evaluation metrics and human evaluation show that the article text provides the primary information needed to reproduce news-image captions written by journalists. The results also demonstrate that the proposed model outperforms the state-of-the-art model. In addition, we confirm that visual features contribute to improving the quality of news-image captions.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="yang-okazaki-2020-image">
    <titleInfo>
      <title>Image Caption Generation for News Articles</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Zhishen</namePart>
      <namePart type="family">Yang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Naoaki</namePart>
      <namePart type="family">Okazaki</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2020-12</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 28th International Conference on Computational Linguistics</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Donia</namePart>
        <namePart type="family">Scott</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Nuria</namePart>
        <namePart type="family">Bel</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Chengqing</namePart>
        <namePart type="family">Zong</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>International Committee on Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Barcelona, Spain (Online)</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>In this paper, we address the task of news-image captioning, which generates a description of an image given the image and its article body as input. This task is more challenging than conventional image captioning because it requires a joint understanding of image and text. We present a Transformer model that integrates text and image modalities and attends to textual features from visual features when generating a caption. Experiments based on automatic evaluation metrics and human evaluation show that the article text provides the primary information needed to reproduce news-image captions written by journalists. The results also demonstrate that the proposed model outperforms the state-of-the-art model. In addition, we confirm that visual features contribute to improving the quality of news-image captions.</abstract>
<identifier type="citekey">yang-okazaki-2020-image</identifier>
<identifier type="doi">10.18653/v1/2020.coling-main.176</identifier>
<location>
<url>https://aclanthology.org/2020.coling-main.176</url>
</location>
<part>
<date>2020-12</date>
<extent unit="page">
<start>1941</start>
<end>1951</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Image Caption Generation for News Articles
%A Yang, Zhishen
%A Okazaki, Naoaki
%Y Scott, Donia
%Y Bel, Nuria
%Y Zong, Chengqing
%S Proceedings of the 28th International Conference on Computational Linguistics
%D 2020
%8 December
%I International Committee on Computational Linguistics
%C Barcelona, Spain (Online)
%F yang-okazaki-2020-image
%X In this paper, we address the task of news-image captioning, which generates a description of an image given the image and its article body as input. This task is more challenging than conventional image captioning because it requires a joint understanding of image and text. We present a Transformer model that integrates text and image modalities and attends to textual features from visual features when generating a caption. Experiments based on automatic evaluation metrics and human evaluation show that the article text provides the primary information needed to reproduce news-image captions written by journalists. The results also demonstrate that the proposed model outperforms the state-of-the-art model. In addition, we confirm that visual features contribute to improving the quality of news-image captions.
%R 10.18653/v1/2020.coling-main.176
%U https://aclanthology.org/2020.coling-main.176
%U https://doi.org/10.18653/v1/2020.coling-main.176
%P 1941-1951
Markdown (Informal)
[Image Caption Generation for News Articles](https://aclanthology.org/2020.coling-main.176) (Yang & Okazaki, COLING 2020)
ACL
Zhishen Yang and Naoaki Okazaki. 2020. Image Caption Generation for News Articles. In Proceedings of the 28th International Conference on Computational Linguistics, pages 1941–1951, Barcelona, Spain (Online). International Committee on Computational Linguistics.