@inproceedings{liu-wan-2023-models,
title = "Models See Hallucinations: Evaluating the Factuality in Video Captioning",
author = "Liu, Hui and
Wan, Xiaojun",
editor = "Bouamor, Houda and
Pino, Juan and
Bali, Kalika",
booktitle = "Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing",
month = dec,
year = "2023",
address = "Singapore",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.emnlp-main.723",
doi = "10.18653/v1/2023.emnlp-main.723",
pages = "11807--11823",
abstract = "Video captioning aims to describe events in a video with natural language. In recent years, many works have focused on improving captioning models{'} performance. However, like other text generation tasks, it risks introducing factual errors not supported by the input video. Factual errors can seriously affect the quality of the generated text, sometimes making it completely unusable. Although factual consistency has received much research attention in text-to-text tasks (e.g., summarization), it is less studied in vision-based text generation. In this work, we conduct the first human evaluation of the factuality in video captioning and annotate two factuality datasets. We find that 56{\%} of the model-generated sentences have factual errors, indicating it is a severe problem in this field, but existing evaluation metrics show little correlation with human factuality annotation. We further propose a weakly-supervised, model-based factuality metric FactVC, which outperforms previous metrics on factuality evaluation of video captioning.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="liu-wan-2023-models">
<titleInfo>
<title>Models See Hallucinations: Evaluating the Factuality in Video Captioning</title>
</titleInfo>
<name type="personal">
<namePart type="given">Hui</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xiaojun</namePart>
<namePart type="family">Wan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Houda</namePart>
<namePart type="family">Bouamor</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Juan</namePart>
<namePart type="family">Pino</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kalika</namePart>
<namePart type="family">Bali</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Singapore</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Video captioning aims to describe events in a video with natural language. In recent years, many works have focused on improving captioning models’ performance. However, like other text generation tasks, it risks introducing factual errors not supported by the input video. Factual errors can seriously affect the quality of the generated text, sometimes making it completely unusable. Although factual consistency has received much research attention in text-to-text tasks (e.g., summarization), it is less studied in vision-based text generation. In this work, we conduct the first human evaluation of the factuality in video captioning and annotate two factuality datasets. We find that 56% of the model-generated sentences have factual errors, indicating it is a severe problem in this field, but existing evaluation metrics show little correlation with human factuality annotation. We further propose a weakly-supervised, model-based factuality metric FactVC, which outperforms previous metrics on factuality evaluation of video captioning.</abstract>
<identifier type="citekey">liu-wan-2023-models</identifier>
<identifier type="doi">10.18653/v1/2023.emnlp-main.723</identifier>
<location>
<url>https://aclanthology.org/2023.emnlp-main.723</url>
</location>
<part>
<date>2023-12</date>
<extent unit="page">
<start>11807</start>
<end>11823</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Models See Hallucinations: Evaluating the Factuality in Video Captioning
%A Liu, Hui
%A Wan, Xiaojun
%Y Bouamor, Houda
%Y Pino, Juan
%Y Bali, Kalika
%S Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing
%D 2023
%8 December
%I Association for Computational Linguistics
%C Singapore
%F liu-wan-2023-models
%X Video captioning aims to describe events in a video with natural language. In recent years, many works have focused on improving captioning models’ performance. However, like other text generation tasks, it risks introducing factual errors not supported by the input video. Factual errors can seriously affect the quality of the generated text, sometimes making it completely unusable. Although factual consistency has received much research attention in text-to-text tasks (e.g., summarization), it is less studied in vision-based text generation. In this work, we conduct the first human evaluation of the factuality in video captioning and annotate two factuality datasets. We find that 56% of the model-generated sentences have factual errors, indicating it is a severe problem in this field, but existing evaluation metrics show little correlation with human factuality annotation. We further propose a weakly-supervised, model-based factuality metric FactVC, which outperforms previous metrics on factuality evaluation of video captioning.
%R 10.18653/v1/2023.emnlp-main.723
%U https://aclanthology.org/2023.emnlp-main.723
%U https://doi.org/10.18653/v1/2023.emnlp-main.723
%P 11807-11823
Markdown (Informal)
[Models See Hallucinations: Evaluating the Factuality in Video Captioning](https://aclanthology.org/2023.emnlp-main.723) (Liu & Wan, EMNLP 2023)
ACL