@inproceedings{das-etal-2024-modality,
title = "Which Modality should {I} use - Text, Motif, or Image? : Understanding Graphs with Large Language Models",
author = "Das, Debarati and
Gupta, Ishaan and
Srivastava, Jaideep and
Kang, Dongyeop",
editor = "Duh, Kevin and
Gomez, Helena and
Bethard, Steven",
booktitle = "Findings of the Association for Computational Linguistics: NAACL 2024",
month = jun,
year = "2024",
address = "Mexico City, Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.findings-naacl.34",
doi = "10.18653/v1/2024.findings-naacl.34",
pages = "503--519",
abstract = "Our research integrates graph data with Large Language Models (LLMs), which, despite their advancements in various fields using large text corpora, face limitations in encoding entire graphs due to context size constraints. This paper introduces a new approach to encoding a graph with diverse modalities, such as text, image, and motif, coupled with prompts to approximate a graph{'}s global connectivity, thereby enhancing LLMs{'} efficiency in processing complex graph structures. The study also presents GraphTMI, a novel benchmark for evaluating LLMs in graph structure analysis, focusing on homophily, motif presence, and graph difficulty. Key findings indicate that the image modality, especially with vision-language models like GPT-4V, is superior to text in balancing token limits and preserving essential information and comes close to prior graph neural net (GNN) encoders. Furthermore, the research assesses how various factors affect the performance of each encoding modality and outlines the existing challenges and potential future developments for LLMs in graph understanding and reasoning tasks. Our code and data are publicly available on our project page - https://minnesotanlp.github.io/GraphLLM/",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="das-etal-2024-modality">
<titleInfo>
<title>Which Modality should I use - Text, Motif, or Image? : Understanding Graphs with Large Language Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Debarati</namePart>
<namePart type="family">Das</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ishaan</namePart>
<namePart type="family">Gupta</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jaideep</namePart>
<namePart type="family">Srivastava</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dongyeop</namePart>
<namePart type="family">Kang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: NAACL 2024</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kevin</namePart>
<namePart type="family">Duh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Helena</namePart>
<namePart type="family">Gomez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Steven</namePart>
<namePart type="family">Bethard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Mexico City, Mexico</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Our research integrates graph data with Large Language Models (LLMs), which, despite their advancements in various fields using large text corpora, face limitations in encoding entire graphs due to context size constraints. This paper introduces a new approach to encoding a graph with diverse modalities, such as text, image, and motif, coupled with prompts to approximate a graph’s global connectivity, thereby enhancing LLMs’ efficiency in processing complex graph structures. The study also presents GraphTMI, a novel benchmark for evaluating LLMs in graph structure analysis, focusing on homophily, motif presence, and graph difficulty. Key findings indicate that the image modality, especially with vision-language models like GPT-4V, is superior to text in balancing token limits and preserving essential information and comes close to prior graph neural net (GNN) encoders. Furthermore, the research assesses how various factors affect the performance of each encoding modality and outlines the existing challenges and potential future developments for LLMs in graph understanding and reasoning tasks. Our code and data are publicly available on our project page - https://minnesotanlp.github.io/GraphLLM/</abstract>
<identifier type="citekey">das-etal-2024-modality</identifier>
<identifier type="doi">10.18653/v1/2024.findings-naacl.34</identifier>
<location>
<url>https://aclanthology.org/2024.findings-naacl.34</url>
</location>
<part>
<date>2024-06</date>
<extent unit="page">
<start>503</start>
<end>519</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Which Modality should I use - Text, Motif, or Image? : Understanding Graphs with Large Language Models
%A Das, Debarati
%A Gupta, Ishaan
%A Srivastava, Jaideep
%A Kang, Dongyeop
%Y Duh, Kevin
%Y Gomez, Helena
%Y Bethard, Steven
%S Findings of the Association for Computational Linguistics: NAACL 2024
%D 2024
%8 June
%I Association for Computational Linguistics
%C Mexico City, Mexico
%F das-etal-2024-modality
%X Our research integrates graph data with Large Language Models (LLMs), which, despite their advancements in various fields using large text corpora, face limitations in encoding entire graphs due to context size constraints. This paper introduces a new approach to encoding a graph with diverse modalities, such as text, image, and motif, coupled with prompts to approximate a graph’s global connectivity, thereby enhancing LLMs’ efficiency in processing complex graph structures. The study also presents GraphTMI, a novel benchmark for evaluating LLMs in graph structure analysis, focusing on homophily, motif presence, and graph difficulty. Key findings indicate that the image modality, especially with vision-language models like GPT-4V, is superior to text in balancing token limits and preserving essential information and comes close to prior graph neural net (GNN) encoders. Furthermore, the research assesses how various factors affect the performance of each encoding modality and outlines the existing challenges and potential future developments for LLMs in graph understanding and reasoning tasks. Our code and data are publicly available on our project page - https://minnesotanlp.github.io/GraphLLM/
%R 10.18653/v1/2024.findings-naacl.34
%U https://aclanthology.org/2024.findings-naacl.34
%U https://doi.org/10.18653/v1/2024.findings-naacl.34
%P 503-519
Markdown (Informal)
[Which Modality should I use - Text, Motif, or Image? : Understanding Graphs with Large Language Models](https://aclanthology.org/2024.findings-naacl.34) (Das et al., Findings 2024)
ACL