BibTeX
@inproceedings{kamigaito-etal-2023-table,
    title = "Table and Image Generation for Investigating Knowledge of Entities in Pre-trained Vision and Language Models",
    author = "Kamigaito, Hidetaka and
      Hayashi, Katsuhiko and
      Watanabe, Taro",
    editor = "Rogers, Anna and
      Boyd-Graber, Jordan and
      Okazaki, Naoaki",
    booktitle = "Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)",
    month = jul,
    year = "2023",
    address = "Toronto, Canada",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2023.acl-short.162",
    doi = "10.18653/v1/2023.acl-short.162",
    pages = "1904--1917",
    abstract = "In this paper, we propose a table and image generation task to verify how the knowledge about entities acquired from natural language is retained in Vision {\&} Language (V {\&} L) models. This task consists of two parts: the first is to generate a table containing knowledge about an entity and its related image, and the second is to generate an image from an entity with a caption and a table containing related knowledge of the entity. In both tasks, the model must know the entities used to perform the generation properly. We created the Wikipedia Table and Image Generation (WikiTIG) dataset from about 200,000 infoboxes in English Wikipedia articles to perform the proposed tasks. We evaluated the performance on the tasks with respect to the above research question using the V {\&} L model OFA, which has achieved state-of-the-art results in multiple tasks. Experimental results show that OFA forgets part of its entity knowledge by pre-training as a complement to improve the performance of image related tasks.",
}
MODS XML
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="kamigaito-etal-2023-table">
    <titleInfo>
      <title>Table and Image Generation for Investigating Knowledge of Entities in Pre-trained Vision and Language Models</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Hidetaka</namePart>
      <namePart type="family">Kamigaito</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Katsuhiko</namePart>
      <namePart type="family">Hayashi</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Taro</namePart>
      <namePart type="family">Watanabe</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2023-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Anna</namePart>
        <namePart type="family">Rogers</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jordan</namePart>
        <namePart type="family">Boyd-Graber</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Naoaki</namePart>
        <namePart type="family">Okazaki</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Toronto, Canada</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>In this paper, we propose a table and image generation task to verify how the knowledge about entities acquired from natural language is retained in Vision &amp; Language (V &amp; L) models. This task consists of two parts: the first is to generate a table containing knowledge about an entity and its related image, and the second is to generate an image from an entity with a caption and a table containing related knowledge of the entity. In both tasks, the model must know the entities used to perform the generation properly. We created the Wikipedia Table and Image Generation (WikiTIG) dataset from about 200,000 infoboxes in English Wikipedia articles to perform the proposed tasks. We evaluated the performance on the tasks with respect to the above research question using the V &amp; L model OFA, which has achieved state-of-the-art results in multiple tasks. Experimental results show that OFA forgets part of its entity knowledge by pre-training as a complement to improve the performance of image related tasks.</abstract>
    <identifier type="citekey">kamigaito-etal-2023-table</identifier>
    <identifier type="doi">10.18653/v1/2023.acl-short.162</identifier>
    <location>
      <url>https://aclanthology.org/2023.acl-short.162</url>
    </location>
    <part>
      <date>2023-07</date>
      <extent unit="page">
        <start>1904</start>
        <end>1917</end>
      </extent>
    </part>
  </mods>
</modsCollection>
Endnote
%0 Conference Proceedings
%T Table and Image Generation for Investigating Knowledge of Entities in Pre-trained Vision and Language Models
%A Kamigaito, Hidetaka
%A Hayashi, Katsuhiko
%A Watanabe, Taro
%Y Rogers, Anna
%Y Boyd-Graber, Jordan
%Y Okazaki, Naoaki
%S Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)
%D 2023
%8 July
%I Association for Computational Linguistics
%C Toronto, Canada
%F kamigaito-etal-2023-table
%X In this paper, we propose a table and image generation task to verify how the knowledge about entities acquired from natural language is retained in Vision & Language (V & L) models. This task consists of two parts: the first is to generate a table containing knowledge about an entity and its related image, and the second is to generate an image from an entity with a caption and a table containing related knowledge of the entity. In both tasks, the model must know the entities used to perform the generation properly. We created the Wikipedia Table and Image Generation (WikiTIG) dataset from about 200,000 infoboxes in English Wikipedia articles to perform the proposed tasks. We evaluated the performance on the tasks with respect to the above research question using the V & L model OFA, which has achieved state-of-the-art results in multiple tasks. Experimental results show that OFA forgets part of its entity knowledge by pre-training as a complement to improve the performance of image related tasks.
%R 10.18653/v1/2023.acl-short.162
%U https://aclanthology.org/2023.acl-short.162
%U https://doi.org/10.18653/v1/2023.acl-short.162
%P 1904-1917
Markdown (Informal)
[Table and Image Generation for Investigating Knowledge of Entities in Pre-trained Vision and Language Models](https://aclanthology.org/2023.acl-short.162) (Kamigaito et al., ACL 2023)
ACL
Hidetaka Kamigaito, Katsuhiko Hayashi, and Taro Watanabe. 2023. Table and Image Generation for Investigating Knowledge of Entities in Pre-trained Vision and Language Models. In Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers), pages 1904–1917, Toronto, Canada. Association for Computational Linguistics.