BibTeX
@inproceedings{fletcher-etal-2024-generating,
    title = "Generating Vehicular Icon Descriptions and Indications Using Large Vision-Language Models",
    author = "Fletcher, James  and
      Dehnen, Nicholas  and
      Tayarani Bathaie, Seyed Nima  and
      An, Aijun  and
      Davoudi, Heidar  and
      DiCarlantonio, Ron  and
      Farmaner, Gary",
    editor = "Dernoncourt, Franck  and
      Preo{\c{t}}iuc-Pietro, Daniel  and
      Shimorina, Anastasia",
    booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing: Industry Track",
    month = nov,
    year = "2024",
    address = "Miami, Florida, US",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2024.emnlp-industry.83",
    pages = "1107--1120",
    abstract = "To enhance a question-answering system for automotive drivers, we tackle the problem of automatic generation of icon image descriptions. The descriptions can match the driver{'}s query about the icon appearing on the dashboard and tell the driver what is happening so that they may take an appropriate action. We use three state-of-the-art large vision-language models to generate both visual and functional descriptions based on the icon image and its context information in the car manual. Both zero-shot and few-shot prompts are used. We create a dataset containing over 400 icons with their ground-truth descriptions and use it to evaluate model-generated descriptions across several performance metrics. Our evaluation shows that two of these models (GPT-4o and Claude 3.5) perform well on this task, while the third model (LLaVA-NEXT) performs poorly.",
}
MODS XML
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="fletcher-etal-2024-generating">
    <titleInfo>
        <title>Generating Vehicular Icon Descriptions and Indications Using Large Vision-Language Models</title>
    </titleInfo>
    <name type="personal">
        <namePart type="given">James</namePart>
        <namePart type="family">Fletcher</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Nicholas</namePart>
        <namePart type="family">Dehnen</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Seyed</namePart>
        <namePart type="given">Nima</namePart>
        <namePart type="family">Tayarani Bathaie</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Aijun</namePart>
        <namePart type="family">An</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Heidar</namePart>
        <namePart type="family">Davoudi</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Ron</namePart>
        <namePart type="family">DiCarlantonio</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Gary</namePart>
        <namePart type="family">Farmaner</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <originInfo>
        <dateIssued>2024-11</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
        <titleInfo>
            <title>Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing: Industry Track</title>
        </titleInfo>
        <name type="personal">
            <namePart type="given">Franck</namePart>
            <namePart type="family">Dernoncourt</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <name type="personal">
            <namePart type="given">Daniel</namePart>
            <namePart type="family">Preoţiuc-Pietro</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <name type="personal">
            <namePart type="given">Anastasia</namePart>
            <namePart type="family">Shimorina</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <originInfo>
            <publisher>Association for Computational Linguistics</publisher>
            <place>
                <placeTerm type="text">Miami, Florida, US</placeTerm>
            </place>
        </originInfo>
        <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>To enhance a question-answering system for automotive drivers, we tackle the problem of automatic generation of icon image descriptions. The descriptions can match the driver’s query about the icon appearing on the dashboard and tell the driver what is happening so that they may take an appropriate action. We use three state-of-the-art large vision-language models to generate both visual and functional descriptions based on the icon image and its context information in the car manual. Both zero-shot and few-shot prompts are used. We create a dataset containing over 400 icons with their ground-truth descriptions and use it to evaluate model-generated descriptions across several performance metrics. Our evaluation shows that two of these models (GPT-4o and Claude 3.5) perform well on this task, while the third model (LLaVA-NEXT) performs poorly.</abstract>
    <identifier type="citekey">fletcher-etal-2024-generating</identifier>
    <location>
        <url>https://aclanthology.org/2024.emnlp-industry.83</url>
    </location>
    <part>
        <date>2024-11</date>
        <extent unit="page">
            <start>1107</start>
            <end>1120</end>
        </extent>
    </part>
</mods>
</modsCollection>
Endnote
%0 Conference Proceedings
%T Generating Vehicular Icon Descriptions and Indications Using Large Vision-Language Models
%A Fletcher, James
%A Dehnen, Nicholas
%A Tayarani Bathaie, Seyed Nima
%A An, Aijun
%A Davoudi, Heidar
%A DiCarlantonio, Ron
%A Farmaner, Gary
%Y Dernoncourt, Franck
%Y Preoţiuc-Pietro, Daniel
%Y Shimorina, Anastasia
%S Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing: Industry Track
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, US
%F fletcher-etal-2024-generating
%X To enhance a question-answering system for automotive drivers, we tackle the problem of automatic generation of icon image descriptions. The descriptions can match the driver’s query about the icon appearing on the dashboard and tell the driver what is happening so that they may take an appropriate action. We use three state-of-the-art large vision-language models to generate both visual and functional descriptions based on the icon image and its context information in the car manual. Both zero-shot and few-shot prompts are used. We create a dataset containing over 400 icons with their ground-truth descriptions and use it to evaluate model-generated descriptions across several performance metrics. Our evaluation shows that two of these models (GPT-4o and Claude 3.5) perform well on this task, while the third model (LLaVA-NEXT) performs poorly.
%U https://aclanthology.org/2024.emnlp-industry.83
%P 1107-1120
Markdown (Informal)
[Generating Vehicular Icon Descriptions and Indications Using Large Vision-Language Models](https://aclanthology.org/2024.emnlp-industry.83) (Fletcher et al., EMNLP 2024)
ACL
James Fletcher, Nicholas Dehnen, Seyed Nima Tayarani Bathaie, Aijun An, Heidar Davoudi, Ron DiCarlantonio, and Gary Farmaner. 2024. Generating Vehicular Icon Descriptions and Indications Using Large Vision-Language Models. In Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing: Industry Track, pages 1107–1120, Miami, Florida, US. Association for Computational Linguistics.