@inproceedings{zhang-wan-2023-exploring,
title = "Exploring the Impact of Vision Features in News Image Captioning",
author = "Zhang, Junzhe and
Wan, Xiaojun",
editor = "Rogers, Anna and
Boyd-Graber, Jordan and
Okazaki, Naoaki",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2023",
month = jul,
year = "2023",
address = "Toronto, Canada",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.findings-acl.818",
doi = "10.18653/v1/2023.findings-acl.818",
pages = "12923--12936",
abstract = "The task of news image captioning aims to generate a detailed caption which describes the specific information of an image in a news article. However, we find that recent state-of-art models can achieve competitive performance even without vision features. To resolve the impact of vision features in the news image captioning task, we conduct extensive experiments with mainstream models based on encoder-decoder framework. From our exploration, we find 1) vision features do contribute to the generation of news image captions; 2) vision features can assist models to better generate entities of captions when the entity information is sufficient in the input textual context of the given article; 3) Regions of specific objects in images contribute to the generation of related entities in captions.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="zhang-wan-2023-exploring">
<titleInfo>
<title>Exploring the Impact of Vision Features in News Image Captioning</title>
</titleInfo>
<name type="personal">
<namePart type="given">Junzhe</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xiaojun</namePart>
<namePart type="family">Wan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2023</title>
</titleInfo>
<name type="personal">
<namePart type="given">Anna</namePart>
<namePart type="family">Rogers</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jordan</namePart>
<namePart type="family">Boyd-Graber</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Naoaki</namePart>
<namePart type="family">Okazaki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Toronto, Canada</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
    <abstract>The task of news image captioning aims to generate a detailed caption that describes the specific information of an image in a news article. However, we find that recent state-of-the-art models can achieve competitive performance even without vision features. To analyze the impact of vision features in the news image captioning task, we conduct extensive experiments with mainstream models based on the encoder-decoder framework. From our exploration, we find that 1) vision features do contribute to the generation of news image captions; 2) vision features can help models better generate entities in captions when the entity information is sufficient in the input textual context of the given article; and 3) regions of specific objects in images contribute to the generation of related entities in captions.</abstract>
<identifier type="citekey">zhang-wan-2023-exploring</identifier>
<identifier type="doi">10.18653/v1/2023.findings-acl.818</identifier>
<location>
<url>https://aclanthology.org/2023.findings-acl.818</url>
</location>
<part>
<date>2023-07</date>
<extent unit="page">
<start>12923</start>
<end>12936</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Exploring the Impact of Vision Features in News Image Captioning
%A Zhang, Junzhe
%A Wan, Xiaojun
%Y Rogers, Anna
%Y Boyd-Graber, Jordan
%Y Okazaki, Naoaki
%S Findings of the Association for Computational Linguistics: ACL 2023
%D 2023
%8 July
%I Association for Computational Linguistics
%C Toronto, Canada
%F zhang-wan-2023-exploring
%X The task of news image captioning aims to generate a detailed caption that describes the specific information of an image in a news article. However, we find that recent state-of-the-art models can achieve competitive performance even without vision features. To analyze the impact of vision features in the news image captioning task, we conduct extensive experiments with mainstream models based on the encoder-decoder framework. From our exploration, we find that 1) vision features do contribute to the generation of news image captions; 2) vision features can help models better generate entities in captions when the entity information is sufficient in the input textual context of the given article; and 3) regions of specific objects in images contribute to the generation of related entities in captions.
%R 10.18653/v1/2023.findings-acl.818
%U https://aclanthology.org/2023.findings-acl.818
%U https://doi.org/10.18653/v1/2023.findings-acl.818
%P 12923-12936
Markdown (Informal)
[Exploring the Impact of Vision Features in News Image Captioning](https://aclanthology.org/2023.findings-acl.818) (Zhang & Wan, Findings 2023)
ACL
Junzhe Zhang and Xiaojun Wan. 2023. Exploring the Impact of Vision Features in News Image Captioning. In Findings of the Association for Computational Linguistics: ACL 2023, pages 12923–12936, Toronto, Canada. Association for Computational Linguistics.