% ACL Anthology record 2026.acl-long.476: MuSe (Wang et al., ACL 2026, Volume 1: Long Papers).
% Braces protect whole words whose capitalisation must survive style recasing.
@inproceedings{wang-etal-2026-muse,
  title     = {{MuSe}: Multi-Stage Graph Reasoning via Vision-Language Models},
  author    = {Wang, Guanyu and
               Chu, Xu and
               Tan, Zhijie and
               Chen, Xinrong and
               Mo, Tong and
               Li, Weiping},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Proceedings of the 64th Annual Meeting of the {Association} for {Computational} {Linguistics} (Volume 1: Long Papers)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.acl-long.476/},
  pages     = {10442--10462},
  isbn      = {979-8-89176-390-6},
  abstract  = {Graph-related tasks are traditionally addressed with Graph Neural Networks (GNNs) or graph transformers, but their task-specific training limits generalization. Large Language Models (LLMs) offer stronger generalization, yet encoding graphs as one-dimensional text struggles to capture multi-hop dependencies and two-dimensional topology. Vision-Language Models (VLMs) provide an alternative by visualizing graphs, but rendering large graphs in a single image causes clutter, occlusion, and distraction, hindering reasoning. We propose MuSe, a novel multi-stage graph reasoning framework based on VLMs. Instead of processing entire graphs at once, MuSe incrementally samples and visualizes task-relevant subgraphs, enabling progressive reasoning. The framework employs a two-stage training paradigm: supervised fine-tuning to acquire local sampling and reasoning skills, followed by reinforcement learning with GRPO to refine the sampling strategy and control dialog length. To support evaluation, we introduce LGVLQA, a new multimodal dataset with larger and more complex graph structures, addressing the scalability limitations of existing benchmarks. Experiments show that MuSe consistently outperforms leading LLM and VLM baselines, demonstrating improved structural understanding and reasoning ability.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!--
  MODS v3 record for citation key wang-etal-2026-muse.
  Structure: the paper's title, six personal-name authors and issue date,
  followed by a host relatedItem describing the proceedings volume (title,
  four editors, publisher, place, ISBN), then the abstract, the landing URL
  and the page extent within the host.
-->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="wang-etal-2026-muse">
    <titleInfo>
      <title>MuSe: Multi-Stage Graph Reasoning via Vision-Language Models</title>
    </titleInfo>
    <!-- Authors, in publication order. -->
    <name type="personal">
      <namePart type="given">Guanyu</namePart>
      <namePart type="family">Wang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Xu</namePart>
      <namePart type="family">Chu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Zhijie</namePart>
      <namePart type="family">Tan</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Xinrong</namePart>
      <namePart type="family">Chen</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Tong</namePart>
      <namePart type="family">Mo</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Weiping</namePart>
      <namePart type="family">Li</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2026-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume this paper appears in. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
      </titleInfo>
      <!-- Volume editors. -->
      <name type="personal">
        <namePart type="given">Maria</namePart>
        <namePart type="family">Liakata</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Viviane</namePart>
        <namePart type="given">P</namePart>
        <namePart type="family">Moreira</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jiajun</namePart>
        <namePart type="family">Zhang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">David</namePart>
        <namePart type="family">Jurgens</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">San Diego, California, United States</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-390-6</identifier>
    </relatedItem>
    <abstract>Graph-related tasks are traditionally addressed with Graph Neural Networks (GNNs) or graph transformers, but their task-specific training limits generalization. Large Language Models (LLMs) offer stronger generalization, yet encoding graphs as one-dimensional text struggles to capture multi-hop dependencies and two-dimensional topology. Vision-Language Models (VLMs) provide an alternative by visualizing graphs, but rendering large graphs in a single image causes clutter, occlusion, and distraction, hindering reasoning. We propose MuSe, a novel multi-stage graph reasoning framework based on VLMs. Instead of processing entire graphs at once, MuSe incrementally samples and visualizes task-relevant subgraphs, enabling progressive reasoning. The framework employs a two-stage training paradigm: supervised fine-tuning to acquire local sampling and reasoning skills, followed by reinforcement learning with GRPO to refine the sampling strategy and control dialog length. To support evaluation, we introduce LGVLQA, a new multimodal dataset with larger and more complex graph structures, addressing the scalability limitations of existing benchmarks. Experiments show that MuSe consistently outperforms leading LLM and VLM baselines, demonstrating improved structural understanding and reasoning ability.</abstract>
    <identifier type="citekey">wang-etal-2026-muse</identifier>
    <location>
      <url>https://aclanthology.org/2026.acl-long.476/</url>
    </location>
    <!-- Position of the paper within the host volume. -->
    <part>
      <date>2026-07</date>
      <extent unit="page">
        <start>10442</start>
        <end>10462</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T MuSe: Multi-Stage Graph Reasoning via Vision-Language Models
%A Wang, Guanyu
%A Chu, Xu
%A Tan, Zhijie
%A Chen, Xinrong
%A Mo, Tong
%A Li, Weiping
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-390-6
%F wang-etal-2026-muse
%X Graph-related tasks are traditionally addressed with Graph Neural Networks (GNNs) or graph transformers, but their task-specific training limits generalization. Large Language Models (LLMs) offer stronger generalization, yet encoding graphs as one-dimensional text struggles to capture multi-hop dependencies and two-dimensional topology. Vision-Language Models (VLMs) provide an alternative by visualizing graphs, but rendering large graphs in a single image causes clutter, occlusion, and distraction, hindering reasoning. We propose MuSe, a novel multi-stage graph reasoning framework based on VLMs. Instead of processing entire graphs at once, MuSe incrementally samples and visualizes task-relevant subgraphs, enabling progressive reasoning. The framework employs a two-stage training paradigm: supervised fine-tuning to acquire local sampling and reasoning skills, followed by reinforcement learning with GRPO to refine the sampling strategy and control dialog length. To support evaluation, we introduce LGVLQA, a new multimodal dataset with larger and more complex graph structures, addressing the scalability limitations of existing benchmarks. Experiments show that MuSe consistently outperforms leading LLM and VLM baselines, demonstrating improved structural understanding and reasoning ability.
%U https://aclanthology.org/2026.acl-long.476/
%P 10442-10462
Markdown (Informal)
[MuSe: Multi-Stage Graph Reasoning via Vision-Language Models](https://aclanthology.org/2026.acl-long.476/) (Wang et al., ACL 2026)
ACL
- Guanyu Wang, Xu Chu, Zhijie Tan, Xinrong Chen, Tong Mo, and Weiping Li. 2026. MuSe: Multi-Stage Graph Reasoning via Vision-Language Models. In Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 10442–10462, San Diego, California, United States. Association for Computational Linguistics.