@inproceedings{prasanth-2026-efficient,
title = "Efficient Visual Grounding in {VQA} via Question-Guided Sparse Attention",
author = "Prasanth",
editor = "Yan, Qianqi and
Montariol, Syrielle and
Fan, Yue and
Gu, Jing and
Pan, Jiayi and
Li, Manling and
Kordjamshidi, Parisa and
Suhr, Alane and
Wang, Xin Eric",
booktitle = "Proceedings of the 4th Workshop on Advances in Language and Vision Research ({ALVR})",
month = jul,
year = "2026",
address = "San Diego, California, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.alvr-main.24/",
pages = "260--271",
ISBN = "979-8-89176-398-2",
abstract = "Visual Question Answering (VQA) models process all image patches uniformly despite questions typically requiring only a small subset of visual information. This inefficiency leads to unnecessary computation and can result in attention dilution across irrelevant image regions. We propose \textbf{Question-Guided Sparse Attention (QGSA)}, a plug-and-play mechanism that dynamically selects relevant image patches conditioned on question semantics. Our approach introduces three components: (1) a differentiable patch selector based on Gumbel-Softmax reparameterisation that enables end-to-end training with hard patch selection at inference; (2) a self-supervised grounding loss that encourages spatial selectivity without bounding-box annotations, combining contrastive patch selection with patch{--}word alignment via a frozen CLIP encoder; and (3) an adaptive sparsity mechanism that adjusts the number of selected patches according to estimated question complexity. Experiments on SmolVLM-256M-Instruct and SmolVLM-500M-Instruct across three VQA benchmarks (VQA-RAD, A-OKVQA, RefCOCO) demonstrate that QGSA reduces cross-attention FLOPs by 91{--}99{\%} across input resolutions, achieving up to $76\times$ theoretical speedup at 576px resolution, while maintaining \textit{exact} accuracy parity with the dense baseline ($\Delta=0.0$ pp on all datasets). Wall-clock parity with the dense baseline is reached at 336px; realised end-to-end speedup requires larger models where cross-attention dominates total compute. QGSA consistently selects an average of $k\approx17$ patches out of 576 (256M model), up to $k\approx18$ (500M model), yielding up to a $34\times$ reduction in the visual token sequence. These small-scale results validate the feasibility of question-conditioned sparse attention and provide a foundation for scaling to larger VLMs."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="prasanth-2026-efficient">
<titleInfo>
<title>Efficient Visual Grounding in VQA via Question-Guided Sparse Attention</title>
</titleInfo>
<name>
<namePart>Prasanth</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 4th Workshop on Advances in Language and Vision Research (ALVR)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Qianqi</namePart>
<namePart type="family">Yan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Syrielle</namePart>
<namePart type="family">Montariol</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yue</namePart>
<namePart type="family">Fan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jing</namePart>
<namePart type="family">Gu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiayi</namePart>
<namePart type="family">Pan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Manling</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Parisa</namePart>
<namePart type="family">Kordjamshidi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alane</namePart>
<namePart type="family">Suhr</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xin</namePart>
<namePart type="given">Eric</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-398-2</identifier>
</relatedItem>
<abstract>Visual Question Answering (VQA) models process all image patches uniformly despite questions typically requiring only a small subset of visual information. This inefficiency leads to unnecessary computation and can result in attention dilution across irrelevant image regions. We propose Question-Guided Sparse Attention (QGSA), a plug-and-play mechanism that dynamically selects relevant image patches conditioned on question semantics. Our approach introduces three components: (1) a differentiable patch selector based on Gumbel-Softmax reparameterisation that enables end-to-end training with hard patch selection at inference; (2) a self-supervised grounding loss that encourages spatial selectivity without bounding-box annotations, combining contrastive patch selection with patch–word alignment via a frozen CLIP encoder; and (3) an adaptive sparsity mechanism that adjusts the number of selected patches according to estimated question complexity. Experiments on SmolVLM-256M-Instruct and SmolVLM-500M-Instruct across three VQA benchmarks (VQA-RAD, A-OKVQA, RefCOCO) demonstrate that QGSA reduces cross-attention FLOPs by 91–99% across input resolutions, achieving up to 76× theoretical speedup at 576px resolution, while maintaining exact accuracy parity with the dense baseline (Δ=0.0 pp on all datasets). Wall-clock parity with the dense baseline is reached at 336px; realised end-to-end speedup requires larger models where cross-attention dominates total compute. QGSA consistently selects an average of k≈17 patches out of 576 (256M model), up to k≈18 (500M model), yielding up to a 34× reduction in the visual token sequence. These small-scale results validate the feasibility of question-conditioned sparse attention and provide a foundation for scaling to larger VLMs.</abstract>
<identifier type="citekey">prasanth-2026-efficient</identifier>
<location>
<url>https://aclanthology.org/2026.alvr-main.24/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>260</start>
<end>271</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Efficient Visual Grounding in VQA via Question-Guided Sparse Attention
%Y Yan, Qianqi
%Y Montariol, Syrielle
%Y Fan, Yue
%Y Gu, Jing
%Y Pan, Jiayi
%Y Li, Manling
%Y Kordjamshidi, Parisa
%Y Suhr, Alane
%Y Wang, Xin Eric
%A Prasanth
%S Proceedings of the 4th Workshop on Advances in Language and Vision Research (ALVR)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, USA
%@ 979-8-89176-398-2
%F prasanth-2026-efficient
%X Visual Question Answering (VQA) models process all image patches uniformly despite questions typically requiring only a small subset of visual information. This inefficiency leads to unnecessary computation and can result in attention dilution across irrelevant image regions. We propose Question-Guided Sparse Attention (QGSA), a plug-and-play mechanism that dynamically selects relevant image patches conditioned on question semantics. Our approach introduces three components: (1) a differentiable patch selector based on Gumbel-Softmax reparameterisation that enables end-to-end training with hard patch selection at inference; (2) a self-supervised grounding loss that encourages spatial selectivity without bounding-box annotations, combining contrastive patch selection with patch–word alignment via a frozen CLIP encoder; and (3) an adaptive sparsity mechanism that adjusts the number of selected patches according to estimated question complexity. Experiments on SmolVLM-256M-Instruct and SmolVLM-500M-Instruct across three VQA benchmarks (VQA-RAD, A-OKVQA, RefCOCO) demonstrate that QGSA reduces cross-attention FLOPs by 91–99% across input resolutions, achieving up to 76× theoretical speedup at 576px resolution, while maintaining exact accuracy parity with the dense baseline (Δ=0.0 pp on all datasets). Wall-clock parity with the dense baseline is reached at 336px; realised end-to-end speedup requires larger models where cross-attention dominates total compute. QGSA consistently selects an average of k≈17 patches out of 576 (256M model), up to k≈18 (500M model), yielding up to a 34× reduction in the visual token sequence. These small-scale results validate the feasibility of question-conditioned sparse attention and provide a foundation for scaling to larger VLMs.
%U https://aclanthology.org/2026.alvr-main.24/
%P 260-271
Markdown (Informal)
[Efficient Visual Grounding in VQA via Question-Guided Sparse Attention](https://aclanthology.org/2026.alvr-main.24/) (Prasanth, ALVR 2026)
ACL