% Workshop paper record (aclanthology.org export); field values are stored
% verbatim and left for the bibliography style to format.
@inproceedings{kapur-kreiss-2024-reference,
  author    = {Kapur, Rhea and Kreiss, Elisa},
  title     = {Reference-Based Metrics Are Biased Against Blind and Low-Vision Users{'} Image Description Preferences},
  editor    = {Dementieva, Daryna and Ignat, Oana and Jin, Zhijing and Mihalcea, Rada and Piatti, Giorgio and Tetreault, Joel and Wilson, Steven and Zhao, Jieyu},
  booktitle = {Proceedings of the Third Workshop on NLP for Positive Impact},
  month     = nov,
  year      = {2024},
  address   = {Miami, Florida, USA},
  publisher = {Association for Computational Linguistics},
  pages     = {308--314},
  doi       = {10.18653/v1/2024.nlp4pi-1.26},
  url       = {https://aclanthology.org/2024.nlp4pi-1.26},
  abstract  = {Image description generation models are sophisticated Vision-Language Models which promise to make visual content, such as images, non-visually accessible through linguistic descriptions. While these systems can benefit all, their primary motivation tends to lie in allowing blind and low-vision (BLV) users access to increasingly visual (online) discourse. Well-defined evaluation methods are crucial for steering model development into socially useful directions. In this work, we show that the most popular evaluation metrics (reference-based metrics) are biased against BLV users and therefore potentially stifle useful model development. Reference-based metrics assign quality scores based on the similarity to human-generated ground-truth descriptions and are widely accepted as neutrally representing the needs of all users. However, we find that these metrics are more strongly correlated with sighted participant ratings than BLV ratings, and we explore factors which appear to mediate this finding: description length, the image{'}s context of appearance, and the number of reference descriptions available. These findings suggest that there is a need for developing evaluation methods that are established based on specific downstream user groups, and they highlight the importance of reflecting on emerging biases against minorities in the development of general-purpose automatic metrics.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 collection holding a single record, keyed by the mods ID below. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="kapur-kreiss-2024-reference">
    <!-- Title and authors of the paper itself. -->
    <titleInfo>
      <title>Reference-Based Metrics Are Biased Against Blind and Low-Vision Users’ Image Description Preferences</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Rhea</namePart>
      <namePart type="family">Kapur</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Elisa</namePart>
      <namePart type="family">Kreiss</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2024-11</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, its editors, publisher and place. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the Third Workshop on NLP for Positive Impact</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Daryna</namePart>
        <namePart type="family">Dementieva</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Oana</namePart>
        <namePart type="family">Ignat</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Zhijing</namePart>
        <namePart type="family">Jin</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Rada</namePart>
        <namePart type="family">Mihalcea</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Giorgio</namePart>
        <namePart type="family">Piatti</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Joel</namePart>
        <namePart type="family">Tetreault</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Steven</namePart>
        <namePart type="family">Wilson</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jieyu</namePart>
        <namePart type="family">Zhao</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Miami, Florida, USA</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Image description generation models are sophisticated Vision-Language Models which promise to make visual content, such as images, non-visually accessible through linguistic descriptions. While these systems can benefit all, their primary motivation tends to lie in allowing blind and low-vision (BLV) users access to increasingly visual (online) discourse. Well-defined evaluation methods are crucial for steering model development into socially useful directions. In this work, we show that the most popular evaluation metrics (reference-based metrics) are biased against BLV users and therefore potentially stifle useful model development. Reference-based metrics assign quality scores based on the similarity to human-generated ground-truth descriptions and are widely accepted as neutrally representing the needs of all users. However, we find that these metrics are more strongly correlated with sighted participant ratings than BLV ratings, and we explore factors which appear to mediate this finding: description length, the image’s context of appearance, and the number of reference descriptions available. These findings suggest that there is a need for developing evaluation methods that are established based on specific downstream user groups, and they highlight the importance of reflecting on emerging biases against minorities in the development of general-purpose automatic metrics.</abstract>
    <!-- Identifiers and access location for the paper. -->
    <identifier type="citekey">kapur-kreiss-2024-reference</identifier>
    <identifier type="doi">10.18653/v1/2024.nlp4pi-1.26</identifier>
    <location>
      <url>https://aclanthology.org/2024.nlp4pi-1.26</url>
    </location>
    <!-- Page extent of the paper. -->
    <part>
      <date>2024-11</date>
      <extent unit="page">
        <start>308</start>
        <end>314</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Reference-Based Metrics Are Biased Against Blind and Low-Vision Users’ Image Description Preferences
%A Kapur, Rhea
%A Kreiss, Elisa
%Y Dementieva, Daryna
%Y Ignat, Oana
%Y Jin, Zhijing
%Y Mihalcea, Rada
%Y Piatti, Giorgio
%Y Tetreault, Joel
%Y Wilson, Steven
%Y Zhao, Jieyu
%S Proceedings of the Third Workshop on NLP for Positive Impact
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F kapur-kreiss-2024-reference
%X Image description generation models are sophisticated Vision-Language Models which promise to make visual content, such as images, non-visually accessible through linguistic descriptions. While these systems can benefit all, their primary motivation tends to lie in allowing blind and low-vision (BLV) users access to increasingly visual (online) discourse. Well-defined evaluation methods are crucial for steering model development into socially useful directions. In this work, we show that the most popular evaluation metrics (reference-based metrics) are biased against BLV users and therefore potentially stifle useful model development. Reference-based metrics assign quality scores based on the similarity to human-generated ground-truth descriptions and are widely accepted as neutrally representing the needs of all users. However, we find that these metrics are more strongly correlated with sighted participant ratings than BLV ratings, and we explore factors which appear to mediate this finding: description length, the image’s context of appearance, and the number of reference descriptions available. These findings suggest that there is a need for developing evaluation methods that are established based on specific downstream user groups, and they highlight the importance of reflecting on emerging biases against minorities in the development of general-purpose automatic metrics.
%R 10.18653/v1/2024.nlp4pi-1.26
%U https://aclanthology.org/2024.nlp4pi-1.26
%U https://doi.org/10.18653/v1/2024.nlp4pi-1.26
%P 308-314
Markdown (Informal)
[Reference-Based Metrics Are Biased Against Blind and Low-Vision Users’ Image Description Preferences](https://aclanthology.org/2024.nlp4pi-1.26) (Kapur & Kreiss, NLP4PI 2024)
ACL