@inproceedings{jiang-etal-2019-tiger,
    title = "{TIGE}r: Text-to-Image Grounding for Image Caption Evaluation",
    author = "Jiang, Ming  and
      Huang, Qiuyuan  and
      Zhang, Lei  and
      Wang, Xin  and
      Zhang, Pengchuan  and
      Gan, Zhe  and
      Diesner, Jana  and
      Gao, Jianfeng",
    editor = "Inui, Kentaro  and
      Jiang, Jing  and
      Ng, Vincent  and
      Wan, Xiaojun",
    booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP)",
    month = nov,
    year = "2019",
    address = "Hong Kong, China",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/D19-1220/",
    doi = "10.18653/v1/D19-1220",
    pages = "2141--2152",
    abstract = "This paper presents a new metric called TIGEr for the automatic evaluation of image captioning systems. Popular metrics, such as BLEU and CIDEr, are based solely on text matching between reference captions and machine-generated captions, potentially leading to biased evaluations because references may not fully cover the image content and natural language is inherently ambiguous. Building upon a machine-learned text-image grounding model, TIGEr allows to evaluate caption quality not only based on how well a caption represents image content, but also on how well machine-generated captions match human-generated captions. Our empirical tests show that TIGEr has a higher consistency with human judgments than alternative existing metrics. We also comprehensively assess the metric{'}s effectiveness in caption evaluation by measuring the correlation between human judgments and metric scores."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="jiang-etal-2019-tiger">
    <titleInfo>
        <title>TIGEr: Text-to-Image Grounding for Image Caption Evaluation</title>
    </titleInfo>
    <name type="personal">
        <namePart type="given">Ming</namePart>
        <namePart type="family">Jiang</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Qiuyuan</namePart>
        <namePart type="family">Huang</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Lei</namePart>
        <namePart type="family">Zhang</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Xin</namePart>
        <namePart type="family">Wang</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Pengchuan</namePart>
        <namePart type="family">Zhang</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Zhe</namePart>
        <namePart type="family">Gan</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Jana</namePart>
        <namePart type="family">Diesner</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Jianfeng</namePart>
        <namePart type="family">Gao</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <originInfo>
        <dateIssued>2019-11</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
        <titleInfo>
            <title>Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP)</title>
        </titleInfo>
        <name type="personal">
            <namePart type="given">Kentaro</namePart>
            <namePart type="family">Inui</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <name type="personal">
            <namePart type="given">Jing</namePart>
            <namePart type="family">Jiang</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <name type="personal">
            <namePart type="given">Vincent</namePart>
            <namePart type="family">Ng</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <name type="personal">
            <namePart type="given">Xiaojun</namePart>
            <namePart type="family">Wan</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <originInfo>
            <publisher>Association for Computational Linguistics</publisher>
            <place>
                <placeTerm type="text">Hong Kong, China</placeTerm>
            </place>
        </originInfo>
        <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>This paper presents a new metric called TIGEr for the automatic evaluation of image captioning systems. Popular metrics, such as BLEU and CIDEr, are based solely on text matching between reference captions and machine-generated captions, potentially leading to biased evaluations because references may not fully cover the image content and natural language is inherently ambiguous. Building upon a machine-learned text-image grounding model, TIGEr allows to evaluate caption quality not only based on how well a caption represents image content, but also on how well machine-generated captions match human-generated captions. Our empirical tests show that TIGEr has a higher consistency with human judgments than alternative existing metrics. We also comprehensively assess the metric’s effectiveness in caption evaluation by measuring the correlation between human judgments and metric scores.</abstract>
    <identifier type="citekey">jiang-etal-2019-tiger</identifier>
    <identifier type="doi">10.18653/v1/D19-1220</identifier>
    <location>
        <url>https://aclanthology.org/D19-1220/</url>
    </location>
    <part>
        <date>2019-11</date>
        <extent unit="page">
            <start>2141</start>
            <end>2152</end>
        </extent>
    </part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T TIGEr: Text-to-Image Grounding for Image Caption Evaluation
%A Jiang, Ming
%A Huang, Qiuyuan
%A Zhang, Lei
%A Wang, Xin
%A Zhang, Pengchuan
%A Gan, Zhe
%A Diesner, Jana
%A Gao, Jianfeng
%Y Inui, Kentaro
%Y Jiang, Jing
%Y Ng, Vincent
%Y Wan, Xiaojun
%S Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP)
%D 2019
%8 November
%I Association for Computational Linguistics
%C Hong Kong, China
%F jiang-etal-2019-tiger
%X This paper presents a new metric called TIGEr for the automatic evaluation of image captioning systems. Popular metrics, such as BLEU and CIDEr, are based solely on text matching between reference captions and machine-generated captions, potentially leading to biased evaluations because references may not fully cover the image content and natural language is inherently ambiguous. Building upon a machine-learned text-image grounding model, TIGEr allows to evaluate caption quality not only based on how well a caption represents image content, but also on how well machine-generated captions match human-generated captions. Our empirical tests show that TIGEr has a higher consistency with human judgments than alternative existing metrics. We also comprehensively assess the metric’s effectiveness in caption evaluation by measuring the correlation between human judgments and metric scores.
%R 10.18653/v1/D19-1220
%U https://aclanthology.org/D19-1220/
%U https://doi.org/10.18653/v1/D19-1220
%P 2141-2152
Markdown (Informal)
[TIGEr: Text-to-Image Grounding for Image Caption Evaluation](https://aclanthology.org/D19-1220/) (Jiang et al., EMNLP-IJCNLP 2019)
ACL
- Ming Jiang, Qiuyuan Huang, Lei Zhang, Xin Wang, Pengchuan Zhang, Zhe Gan, Jana Diesner, and Jianfeng Gao. 2019. TIGEr: Text-to-Image Grounding for Image Caption Evaluation. In Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP), pages 2141–2152, Hong Kong, China. Association for Computational Linguistics.