@inproceedings{wang-gaizauskas-2016-cross,
title = "Cross-validating Image Description Datasets and Evaluation Metrics",
author = "Wang, Josiah and
Gaizauskas, Robert",
editor = "Calzolari, Nicoletta and
Choukri, Khalid and
Declerck, Thierry and
Goggi, Sara and
Grobelnik, Marko and
Maegaard, Bente and
Mariani, Joseph and
Mazo, Helene and
Moreno, Asuncion and
Odijk, Jan and
Piperidis, Stelios",
booktitle = "Proceedings of the Tenth International Conference on Language Resources and Evaluation ({LREC}'16)",
month = may,
year = "2016",
address = "Portoro{\v{z}}, Slovenia",
publisher = "European Language Resources Association (ELRA)",
url = "https://aclanthology.org/L16-1489",
pages = "3059--3066",
abstract = "The task of automatically generating sentential descriptions of image content has become increasingly popular in recent years, resulting in the development of large-scale image description datasets and the proposal of various metrics for evaluating image description generation systems. However, not much work has been done to analyse and understand both datasets and the metrics. In this paper, we propose using a leave-one-out cross validation (LOOCV) process as a means to analyse multiply annotated, human-authored image description datasets and the various evaluation metrics, i.e. evaluating one image description against other human-authored descriptions of the same image. Such an evaluation process affords various insights into the image description datasets and evaluation metrics, such as the variations of image descriptions within and across datasets and also what the metrics capture. We compute and analyse (i) human upper-bound performance; (ii) ranked correlation between metric pairs across datasets; (iii) lower-bound performance by comparing a set of descriptions describing one image to another sentence not describing that image. Interesting observations are made about the evaluation metrics and image description datasets, and we conclude that such cross-validation methods are extremely useful for assessing and gaining insights into image description datasets and evaluation metrics for image descriptions.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="wang-gaizauskas-2016-cross">
<titleInfo>
<title>Cross-validating Image Description Datasets and Evaluation Metrics</title>
</titleInfo>
<name type="personal">
<namePart type="given">Josiah</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Robert</namePart>
<namePart type="family">Gaizauskas</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2016-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC’16)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Khalid</namePart>
<namePart type="family">Choukri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Thierry</namePart>
<namePart type="family">Declerck</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sara</namePart>
<namePart type="family">Goggi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marko</namePart>
<namePart type="family">Grobelnik</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bente</namePart>
<namePart type="family">Maegaard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joseph</namePart>
<namePart type="family">Mariani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Helene</namePart>
<namePart type="family">Mazo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Asuncion</namePart>
<namePart type="family">Moreno</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jan</namePart>
<namePart type="family">Odijk</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stelios</namePart>
<namePart type="family">Piperidis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>European Language Resources Association (ELRA)</publisher>
<place>
<placeTerm type="text">Portorož, Slovenia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>The task of automatically generating sentential descriptions of image content has become increasingly popular in recent years, resulting in the development of large-scale image description datasets and the proposal of various metrics for evaluating image description generation systems. However, not much work has been done to analyse and understand both datasets and the metrics. In this paper, we propose using a leave-one-out cross validation (LOOCV) process as a means to analyse multiply annotated, human-authored image description datasets and the various evaluation metrics, i.e. evaluating one image description against other human-authored descriptions of the same image. Such an evaluation process affords various insights into the image description datasets and evaluation metrics, such as the variations of image descriptions within and across datasets and also what the metrics capture. We compute and analyse (i) human upper-bound performance; (ii) ranked correlation between metric pairs across datasets; (iii) lower-bound performance by comparing a set of descriptions describing one image to another sentence not describing that image. Interesting observations are made about the evaluation metrics and image description datasets, and we conclude that such cross-validation methods are extremely useful for assessing and gaining insights into image description datasets and evaluation metrics for image descriptions.</abstract>
<identifier type="citekey">wang-gaizauskas-2016-cross</identifier>
<location>
<url>https://aclanthology.org/L16-1489</url>
</location>
<part>
<date>2016-05</date>
<extent unit="page">
<start>3059</start>
<end>3066</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Cross-validating Image Description Datasets and Evaluation Metrics
%A Wang, Josiah
%A Gaizauskas, Robert
%Y Calzolari, Nicoletta
%Y Choukri, Khalid
%Y Declerck, Thierry
%Y Goggi, Sara
%Y Grobelnik, Marko
%Y Maegaard, Bente
%Y Mariani, Joseph
%Y Mazo, Helene
%Y Moreno, Asuncion
%Y Odijk, Jan
%Y Piperidis, Stelios
%S Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC’16)
%D 2016
%8 May
%I European Language Resources Association (ELRA)
%C Portorož, Slovenia
%F wang-gaizauskas-2016-cross
%X The task of automatically generating sentential descriptions of image content has become increasingly popular in recent years, resulting in the development of large-scale image description datasets and the proposal of various metrics for evaluating image description generation systems. However, not much work has been done to analyse and understand both datasets and the metrics. In this paper, we propose using a leave-one-out cross validation (LOOCV) process as a means to analyse multiply annotated, human-authored image description datasets and the various evaluation metrics, i.e. evaluating one image description against other human-authored descriptions of the same image. Such an evaluation process affords various insights into the image description datasets and evaluation metrics, such as the variations of image descriptions within and across datasets and also what the metrics capture. We compute and analyse (i) human upper-bound performance; (ii) ranked correlation between metric pairs across datasets; (iii) lower-bound performance by comparing a set of descriptions describing one image to another sentence not describing that image. Interesting observations are made about the evaluation metrics and image description datasets, and we conclude that such cross-validation methods are extremely useful for assessing and gaining insights into image description datasets and evaluation metrics for image descriptions.
%U https://aclanthology.org/L16-1489
%P 3059-3066
Markdown (Informal)
[Cross-validating Image Description Datasets and Evaluation Metrics](https://aclanthology.org/L16-1489) (Wang & Gaizauskas, LREC 2016)
ACL