BibTeX
@inproceedings{kao-chen-2024-visualizing,
title = "Visualizing Dialogues: Enhancing Image Selection through Dialogue Understanding with Large Language Models",
author = "Kao, Chang-Sheng and
Chen, Yun-Nung",
editor = "Ku, Lun-Wei and
Martins, Andre and
Srikumar, Vivek",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2024",
month = aug,
year = "2024",
address = "Bangkok, Thailand",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.findings-acl.700/",
doi = "10.18653/v1/2024.findings-acl.700",
pages = "11777--11788",
abstract = "For dialogue systems, the utilization of multimodal dialogue responses, as opposed to relying solely on text-only responses, offers the capability to describe different concepts through various modalities. This enhances the effectiveness of communication and elevates the overall conversational experience. However, current methods for dialogue-to-image retrieval are constrained by the capabilities of pre-trained vision-language models (VLMs). They struggle to accurately extract key information from conversations and are unable to handle long-turn conversations. In this paper, we leverage the reasoning capabilities of large language models (LLMs) to predict the potential features that may be present in the images to be shared, based on the dialogue context. This approach allows us to obtain succinct and precise descriptors, thereby improving the performance of text-image retrieval. Experimental results show that our method significantly outperforms previous approaches in terms of Recall@k."
}

MODS XML
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="kao-chen-2024-visualizing">
    <titleInfo>
      <title>Visualizing Dialogues: Enhancing Image Selection through Dialogue Understanding with Large Language Models</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Chang-Sheng</namePart>
      <namePart type="family">Kao</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Yun-Nung</namePart>
      <namePart type="family">Chen</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2024-08</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: ACL 2024</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Lun-Wei</namePart>
        <namePart type="family">Ku</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Andre</namePart>
        <namePart type="family">Martins</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Vivek</namePart>
        <namePart type="family">Srikumar</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Bangkok, Thailand</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>For dialogue systems, the utilization of multimodal dialogue responses, as opposed to relying solely on text-only responses, offers the capability to describe different concepts through various modalities. This enhances the effectiveness of communication and elevates the overall conversational experience. However, current methods for dialogue-to-image retrieval are constrained by the capabilities of pre-trained vision-language models (VLMs). They struggle to accurately extract key information from conversations and are unable to handle long-turn conversations. In this paper, we leverage the reasoning capabilities of large language models (LLMs) to predict the potential features that may be present in the images to be shared, based on the dialogue context. This approach allows us to obtain succinct and precise descriptors, thereby improving the performance of text-image retrieval. Experimental results show that our method significantly outperforms previous approaches in terms of Recall@k.</abstract>
    <identifier type="citekey">kao-chen-2024-visualizing</identifier>
    <identifier type="doi">10.18653/v1/2024.findings-acl.700</identifier>
    <location>
      <url>https://aclanthology.org/2024.findings-acl.700/</url>
    </location>
    <part>
      <date>2024-08</date>
      <extent unit="page">
        <start>11777</start>
        <end>11788</end>
      </extent>
    </part>
  </mods>
</modsCollection>

Endnote
%0 Conference Proceedings
%T Visualizing Dialogues: Enhancing Image Selection through Dialogue Understanding with Large Language Models
%A Kao, Chang-Sheng
%A Chen, Yun-Nung
%Y Ku, Lun-Wei
%Y Martins, Andre
%Y Srikumar, Vivek
%S Findings of the Association for Computational Linguistics: ACL 2024
%D 2024
%8 August
%I Association for Computational Linguistics
%C Bangkok, Thailand
%F kao-chen-2024-visualizing
%X For dialogue systems, the utilization of multimodal dialogue responses, as opposed to relying solely on text-only responses, offers the capability to describe different concepts through various modalities. This enhances the effectiveness of communication and elevates the overall conversational experience. However, current methods for dialogue-to-image retrieval are constrained by the capabilities of pre-trained vision-language models (VLMs). They struggle to accurately extract key information from conversations and are unable to handle long-turn conversations. In this paper, we leverage the reasoning capabilities of large language models (LLMs) to predict the potential features that may be present in the images to be shared, based on the dialogue context. This approach allows us to obtain succinct and precise descriptors, thereby improving the performance of text-image retrieval. Experimental results show that our method significantly outperforms previous approaches in terms of Recall@k.
%R 10.18653/v1/2024.findings-acl.700
%U https://aclanthology.org/2024.findings-acl.700/
%U https://doi.org/10.18653/v1/2024.findings-acl.700
%P 11777-11788

Markdown (Informal)
[Visualizing Dialogues: Enhancing Image Selection through Dialogue Understanding with Large Language Models](https://aclanthology.org/2024.findings-acl.700/) (Kao & Chen, Findings 2024)
ACL
Chang-Sheng Kao and Yun-Nung Chen. 2024. Visualizing Dialogues: Enhancing Image Selection through Dialogue Understanding with Large Language Models. In Findings of the Association for Computational Linguistics: ACL 2024, pages 11777–11788, Bangkok, Thailand. Association for Computational Linguistics.