@inproceedings{ilinykh-dobnik-2025-surprisal,
title = "Surprisal reveals diversity gaps in image captioning and different scorers change the story",
author = "Ilinykh, Nikolai and
Dobnik, Simon",
editor = "Flek, Lucie and
Narayan, Shashi and
Phương, L{\^e} Hồng and
Pei, Jiahuan",
booktitle = "Proceedings of the 18th International Natural Language Generation Conference",
month = oct,
year = "2025",
address = "Hanoi, Vietnam",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.inlg-main.22/",
pages = "366--375",
abstract = "We quantify linguistic diversity in image captioning with surprisal variance {--} the spread of token-level negative log-probabilities within a caption set. On the MSCOCO test set, we compare five state-of-the-art vision-and-language LLMs, decoded with greedy and nucleus sampling, to human captions. Measured with a caption-trained n-gram LM, humans display roughly twice the surprisal variance of models, but rescoring the same captions with a general-language model reverses the pattern. Our analysis introduces the surprisal-based diversity metric for image captioning. We show that relying on a single scorer can completely invert conclusions, thus, robust diversity evaluation must report surprisal under several scorers."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="ilinykh-dobnik-2025-surprisal">
<titleInfo>
<title>Surprisal reveals diversity gaps in image captioning and different scorers change the story</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nikolai</namePart>
<namePart type="family">Ilinykh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Simon</namePart>
<namePart type="family">Dobnik</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-10</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 18th International Natural Language Generation Conference</title>
</titleInfo>
<name type="personal">
<namePart type="given">Lucie</namePart>
<namePart type="family">Flek</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shashi</namePart>
<namePart type="family">Narayan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lê</namePart>
<namePart type="given">Hồng</namePart>
<namePart type="family">Phương</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiahuan</namePart>
<namePart type="family">Pei</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Hanoi, Vietnam</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We quantify linguistic diversity in image captioning with surprisal variance – the spread of token-level negative log-probabilities within a caption set. On the MSCOCO test set, we compare five state-of-the-art vision-and-language LLMs, decoded with greedy and nucleus sampling, to human captions. Measured with a caption-trained n-gram LM, humans display roughly twice the surprisal variance of models, but rescoring the same captions with a general-language model reverses the pattern. Our analysis introduces the surprisal-based diversity metric for image captioning. We show that relying on a single scorer can completely invert conclusions, thus, robust diversity evaluation must report surprisal under several scorers.</abstract>
<identifier type="citekey">ilinykh-dobnik-2025-surprisal</identifier>
<location>
<url>https://aclanthology.org/2025.inlg-main.22/</url>
</location>
<part>
<date>2025-10</date>
<extent unit="page">
<start>366</start>
<end>375</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Surprisal reveals diversity gaps in image captioning and different scorers change the story
%A Ilinykh, Nikolai
%A Dobnik, Simon
%Y Flek, Lucie
%Y Narayan, Shashi
%Y Phương, Lê Hồng
%Y Pei, Jiahuan
%S Proceedings of the 18th International Natural Language Generation Conference
%D 2025
%8 October
%I Association for Computational Linguistics
%C Hanoi, Vietnam
%F ilinykh-dobnik-2025-surprisal
%X We quantify linguistic diversity in image captioning with surprisal variance – the spread of token-level negative log-probabilities within a caption set. On the MSCOCO test set, we compare five state-of-the-art vision-and-language LLMs, decoded with greedy and nucleus sampling, to human captions. Measured with a caption-trained n-gram LM, humans display roughly twice the surprisal variance of models, but rescoring the same captions with a general-language model reverses the pattern. Our analysis introduces the surprisal-based diversity metric for image captioning. We show that relying on a single scorer can completely invert conclusions, thus, robust diversity evaluation must report surprisal under several scorers.
%U https://aclanthology.org/2025.inlg-main.22/
%P 366-375
Markdown (Informal)
[Surprisal reveals diversity gaps in image captioning and different scorers change the story](https://aclanthology.org/2025.inlg-main.22/) (Ilinykh & Dobnik, INLG 2025)
ACL