% ACL Anthology record 2024.hucllm-1.5 (HuCLLM 2024 workshop paper).
% The exported record carried the placeholder address "TBD" and the
% abbreviated publisher "ACL"; both are spelled out below so the entry
% renders correctly under any bibliography style.
@inproceedings{karamolegkou-etal-2024-vision,
    title = "Vision-Language Models under Cultural and Inclusive Considerations",
    author = "Karamolegkou, Antonia and
      Rust, Phillip and
      Cui, Ruixiang and
      Cao, Yong and
      S{\o}gaard, Anders and
      Hershcovich, Daniel",
    editor = "Soni, Nikita and
      Flek, Lucie and
      Sharma, Ashish and
      Yang, Diyi and
      Hooker, Sara and
      Schwartz, H. Andrew",
    booktitle = "Proceedings of the 1st Human-Centered Large Language Modeling Workshop",
    month = aug,
    year = "2024",
    address = "Bangkok, Thailand",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2024.hucllm-1.5/",
    doi = "10.18653/v1/2024.hucllm-1.5",
    pages = "53--66",
    abstract = "Large Vision Language Models can be used to assist visually impaired individuals by describing images they capture in their daily lives. Current evaluation datasets may not reflect the diverse cultural user backgrounds nor the situational context of this use case. To address this problem, we create a survey to determine caption preferences and propose a culture-centric evaluation benchmark by filtering VizWiz, an existing dataset with images taken by people who are blind. We then evaluate different models and prompts, investigating their reliability as visual assistants. While the evaluation results for state-of-the-art models seem promising, we identified some weak spots such as hallucinations and problems with conventional evaluation metrics. Our survey, data, code, and model outputs will be publicly available."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="karamolegkou-etal-2024-vision">
<titleInfo>
<title>Vision-Language Models under Cultural and Inclusive Considerations</title>
</titleInfo>
<name type="personal">
<namePart type="given">Antonia</namePart>
<namePart type="family">Karamolegkou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Phillip</namePart>
<namePart type="family">Rust</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ruixiang</namePart>
<namePart type="family">Cui</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yong</namePart>
<namePart type="family">Cao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anders</namePart>
<namePart type="family">Søgaard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Daniel</namePart>
<namePart type="family">Hershcovich</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 1st Human-Centered Large Language Modeling Workshop</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nikita</namePart>
<namePart type="family">Soni</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lucie</namePart>
<namePart type="family">Flek</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ashish</namePart>
<namePart type="family">Sharma</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Diyi</namePart>
<namePart type="family">Yang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sara</namePart>
<namePart type="family">Hooker</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">H</namePart>
<namePart type="given">Andrew</namePart>
<namePart type="family">Schwartz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Bangkok, Thailand</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Large Vision Language Models can be used to assist visually impaired individuals by describing images they capture in their daily lives. Current evaluation datasets may not reflect the diverse cultural user backgrounds nor the situational context of this use case. To address this problem, we create a survey to determine caption preferences and propose a culture-centric evaluation benchmark by filtering VizWiz, an existing dataset with images taken by people who are blind. We then evaluate different models and prompts, investigating their reliability as visual assistants. While the evaluation results for state-of-the-art models seem promising, we identified some weak spots such as hallucinations and problems with conventional evaluation metrics. Our survey, data, code, and model outputs will be publicly available.</abstract>
<identifier type="citekey">karamolegkou-etal-2024-vision</identifier>
<identifier type="doi">10.18653/v1/2024.hucllm-1.5</identifier>
<location>
<url>https://aclanthology.org/2024.hucllm-1.5/</url>
</location>
<part>
<date>2024-08</date>
<extent unit="page">
<start>53</start>
<end>66</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Vision-Language Models under Cultural and Inclusive Considerations
%A Karamolegkou, Antonia
%A Rust, Phillip
%A Cui, Ruixiang
%A Cao, Yong
%A Søgaard, Anders
%A Hershcovich, Daniel
%Y Soni, Nikita
%Y Flek, Lucie
%Y Sharma, Ashish
%Y Yang, Diyi
%Y Hooker, Sara
%Y Schwartz, H. Andrew
%S Proceedings of the 1st Human-Centered Large Language Modeling Workshop
%D 2024
%8 August
%I Association for Computational Linguistics
%C Bangkok, Thailand
%F karamolegkou-etal-2024-vision
%X Large Vision Language Models can be used to assist visually impaired individuals by describing images they capture in their daily lives. Current evaluation datasets may not reflect the diverse cultural user backgrounds nor the situational context of this use case. To address this problem, we create a survey to determine caption preferences and propose a culture-centric evaluation benchmark by filtering VizWiz, an existing dataset with images taken by people who are blind. We then evaluate different models and prompts, investigating their reliability as visual assistants. While the evaluation results for state-of-the-art models seem promising, we identified some weak spots such as hallucinations and problems with conventional evaluation metrics. Our survey, data, code, and model outputs will be publicly available.
%R 10.18653/v1/2024.hucllm-1.5
%U https://aclanthology.org/2024.hucllm-1.5/
%U https://doi.org/10.18653/v1/2024.hucllm-1.5
%P 53-66
Markdown (Informal)
[Vision-Language Models under Cultural and Inclusive Considerations](https://aclanthology.org/2024.hucllm-1.5/) (Karamolegkou et al., HuCLLM 2024)
ACL