@inproceedings{chen-etal-2026-gobench,
title = "{GOB}ench: Stage-Wise Diagnostics and the Visual Paradox in Multimodal Graph Optimization",
author = "Chen, Yinghao and
Xie, Wantong and
Zeng, Shuli and
Zhang, Sijia and
Pan, Xiaotian and
Wu, Feng and
Li, Xiangyang",
editor = "Liakata, Maria and
Moreira, Viviane P. and
Zhang, Jiajun and
Jurgens, David",
booktitle = "Findings of the {A}ssociation for {C}omputational {L}inguistics: {ACL} 2026",
month = jul,
year = "2026",
address = "San Diego, California, United States",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.findings-acl.306/",
pages = "6144--6167",
ISBN = "979-8-89176-395-1",
abstract = "Large language models (LLMs) and vision-language models (VLMs) are increasingly used as optimization assistants to produce solutions, generate solver-executable programs, or both. However, current evaluations are misaligned with deployment in three ways: they (P1) fail to represent multimodal problem specifications, (P2) score outcomes only and cannot localize where failures occur along the modeling pipeline, and (P3) rarely report inference cost, obscuring reliability{--}cost trade-offs. We introduce Graph Optimization benchmark (GOBench), an aligned multimodal benchmark with solver-derived oracles and a four-layer diagnostic protocol that evaluates intermediate artifacts as well as end results, together with the Visual Inference Penalty (VIP) to measure multimodal overhead. Across frontier and open-weight models under paired text-only vs. T+V settings, we find that vision reliably increases inference cost, while its reliability impact is regime-dependent: frontier models often benefit from visual grounding, whereas several mid-tier/open models exhibit a Visual Paradox where vision reduces downstream executability and verification coverage. End-to-end success is frequently bottlenecked by intermediate-stage dropout; supervised fine-tuning on intermediate targets can mitigate this attrition in open models, enabling a reproducible harness for diagnosing failure modes and quantifying reliability{--}cost trade-offs."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="chen-etal-2026-gobench">
<titleInfo>
<title>GOBench: Stage-Wise Diagnostics and the Visual Paradox in Multimodal Graph Optimization</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yinghao</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wantong</namePart>
<namePart type="family">Xie</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shuli</namePart>
<namePart type="family">Zeng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sijia</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xiaotian</namePart>
<namePart type="family">Pan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Feng</namePart>
<namePart type="family">Wu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xiangyang</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2026</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Liakata</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Viviane</namePart>
<namePart type="given">P.</namePart>
<namePart type="family">Moreira</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiajun</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Jurgens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-395-1</identifier>
</relatedItem>
<abstract>Large language models (LLMs) and vision-language models (VLMs) are increasingly used as optimization assistants to produce solutions, generate solver-executable programs, or both. However, current evaluations are misaligned with deployment in three ways: they (P1) fail to represent multimodal problem specifications, (P2) score outcomes only and cannot localize where failures occur along the modeling pipeline, and (P3) rarely report inference cost, obscuring reliability–cost trade-offs. We introduce Graph Optimization benchmark (GOBench), an aligned multimodal benchmark with solver-derived oracles and a four-layer diagnostic protocol that evaluates intermediate artifacts as well as end results, together with the Visual Inference Penalty (VIP) to measure multimodal overhead. Across frontier and open-weight models under paired text-only vs. T+V settings, we find that vision reliably increases inference cost, while its reliability impact is regime-dependent: frontier models often benefit from visual grounding, whereas several mid-tier/open models exhibit a Visual Paradox where vision reduces downstream executability and verification coverage. End-to-end success is frequently bottlenecked by intermediate-stage dropout; supervised fine-tuning on intermediate targets can mitigate this attrition in open models, enabling a reproducible harness for diagnosing failure modes and quantifying reliability–cost trade-offs.</abstract>
<identifier type="citekey">chen-etal-2026-gobench</identifier>
<location>
<url>https://aclanthology.org/2026.findings-acl.306/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>6144</start>
<end>6167</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T GOBench: Stage-Wise Diagnostics and the Visual Paradox in Multimodal Graph Optimization
%A Chen, Yinghao
%A Xie, Wantong
%A Zeng, Shuli
%A Zhang, Sijia
%A Pan, Xiaotian
%A Wu, Feng
%A Li, Xiangyang
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Findings of the Association for Computational Linguistics: ACL 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-395-1
%F chen-etal-2026-gobench
%X Large language models (LLMs) and vision-language models (VLMs) are increasingly used as optimization assistants to produce solutions, generate solver-executable programs, or both. However, current evaluations are misaligned with deployment in three ways: they (P1) fail to represent multimodal problem specifications, (P2) score outcomes only and cannot localize where failures occur along the modeling pipeline, and (P3) rarely report inference cost, obscuring reliability–cost trade-offs. We introduce Graph Optimization benchmark (GOBench), an aligned multimodal benchmark with solver-derived oracles and a four-layer diagnostic protocol that evaluates intermediate artifacts as well as end results, together with the Visual Inference Penalty (VIP) to measure multimodal overhead. Across frontier and open-weight models under paired text-only vs. T+V settings, we find that vision reliably increases inference cost, while its reliability impact is regime-dependent: frontier models often benefit from visual grounding, whereas several mid-tier/open models exhibit a Visual Paradox where vision reduces downstream executability and verification coverage. End-to-end success is frequently bottlenecked by intermediate-stage dropout; supervised fine-tuning on intermediate targets can mitigate this attrition in open models, enabling a reproducible harness for diagnosing failure modes and quantifying reliability–cost trade-offs.
%U https://aclanthology.org/2026.findings-acl.306/
%P 6144-6167
Markdown (Informal)
[GOBench: Stage-Wise Diagnostics and the Visual Paradox in Multimodal Graph Optimization](https://aclanthology.org/2026.findings-acl.306/) (Chen et al., Findings 2026)
ACL
- Yinghao Chen, Wantong Xie, Shuli Zeng, Sijia Zhang, Xiaotian Pan, Feng Wu, and Xiangyang Li. 2026. GOBench: Stage-Wise Diagnostics and the Visual Paradox in Multimodal Graph Optimization. In Findings of the Association for Computational Linguistics: ACL 2026, pages 6144–6167, San Diego, California, United States. Association for Computational Linguistics.