% ACL 2026 (Volume 1: Long Papers) entry for the AIM-CoT paper by Li and Ma.
% Case-sensitive names in title/booktitle are brace-protected as whole words
% so that sentence-casing bibliography styles leave them intact.
@inproceedings{li-ma-2026-aim,
  title     = {{AIM-CoT}: Active Information-driven Multimodal Chain-of-Thought for Vision-Language Reasoning},
  author    = {Li, Xiping and
               Ma, Jianghong},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Proceedings of the 64th Annual Meeting of the {Association for Computational Linguistics} (Volume 1: Long Papers)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.acl-long.1227/},
  doi       = {10.18653/v1/2026.acl-long.1227},
  pages     = {26656--26681},
  isbn      = {979-8-89176-390-6},
  abstract  = {Interleaved-Modal Chain-of-Thought (I-MCoT) advances vision-language reasoning, such as Visual Question Answering (VQA). This paradigm integrates specially selected visual evidence from the input image into the context of Vision-Language Models (VLMs), enabling them to ground their reasoning logic in these details. Accordingly, the efficacy of an I-MCoT framework relies on identifying *what* to see (evidence selection) and *when* to see it (triggering of insertions). However, existing methods fall short in both aspects. First, for selection, they rely on attention signals, which are unreliable{---}particularly under severe granularity imbalance between the brief textual query and the informative image. Second, for triggering, they adopt static triggers, which fail to capture the VLMs' dynamic needs for visual evidence. To this end, we propose a novel I-MCoT framework, **A**ctive **I**nformation-driven **M**ulti-modal **C**hain-**o**f-**T**hought (**AIM-CoT**), which aims to improve both evidence selection and insertion triggering via: (1) **Context-enhanced Attention-map Generation (CAG)** to mitigate granularity imbalance via textual context enhancement; (2) **Active Visual Probing (AVP)** to proactively select the most informative evidence via an information foraging process; and (3) **Dynamic Attention-shift Trigger (DAT)** to precisely activate insertions when VLM{'}s attention shifts from text to visual context. Experiments across three benchmarks and four backbones demonstrate AIM-CoT{'}s consistent superiority. Our code is available at https://anonymous.4open.science/r/AIMCoT.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<!-- Single MODS v3 record; the mods ID equals the citekey identifier further down. -->
<mods ID="li-ma-2026-aim">
<titleInfo>
<title>AIM-CoT: Active Information-driven Multimodal Chain-of-Thought for Vision-Language Reasoning</title>
</titleInfo>
<!-- Authors of the paper, one name element each. -->
<name type="personal">
<namePart type="given">Xiping</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jianghong</namePart>
<namePart type="family">Ma</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<!-- Host item: the proceedings volume, carrying its title, editors, publisher, place, genre and ISBN. -->
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Liakata</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Viviane</namePart>
<namePart type="given">P</namePart>
<namePart type="family">Moreira</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiajun</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Jurgens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-390-6</identifier>
</relatedItem>
<!-- Abstract text keeps the Markdown-style asterisk emphasis exactly as supplied. -->
<abstract>Interleaved-Modal Chain-of-Thought (I-MCoT) advances vision-language reasoning, such as Visual Question Answering (VQA). This paradigm integrates specially selected visual evidence from the input image into the context of Vision-Language Models (VLMs), enabling them to ground their reasoning logic in these details. Accordingly, the efficacy of an I-MCoT framework relies on identifying *what* to see (evidence selection) and *when* to see it (triggering of insertions). However, existing methods fall short in both aspects. First, for selection, they rely on attention signals, which are unreliable—particularly under severe granularity imbalance between the brief textual query and the informative image. Second, for triggering, they adopt static triggers, which fail to capture the VLMs’ dynamic needs for visual evidence. To this end, we propose a novel I-MCoT framework, **A**ctive **I**nformation-driven **M**ulti-modal **C**hain-**o**f-**T**hought (**AIM-CoT**), which aims to improve both evidence selection and insertion triggering via: (1) **Context-enhanced Attention-map Generation (CAG)** to mitigate granularity imbalance via textual context enhancement; (2) **Active Visual Probing (AVP)** to proactively select the most informative evidence via an information foraging process; and (3) **Dynamic Attention-shift Trigger (DAT)** to precisely activate insertions when VLM’s attention shifts from text to visual context. Experiments across three benchmarks and four backbones demonstrate AIM-CoT’s consistent superiority. Our code is available at https://anonymous.4open.science/r/AIMCoT.</abstract>
<!-- Identifiers and landing-page URL for the paper itself. -->
<identifier type="citekey">li-ma-2026-aim</identifier>
<identifier type="doi">10.18653/v1/2026.acl-long.1227</identifier>
<location>
<url>https://aclanthology.org/2026.acl-long.1227/</url>
</location>
<!-- Part details: date plus the first and last page of the paper. -->
<part>
<date>2026-07</date>
<extent unit="page">
<start>26656</start>
<end>26681</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T AIM-CoT: Active Information-driven Multimodal Chain-of-Thought for Vision-Language Reasoning
%A Li, Xiping
%A Ma, Jianghong
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-390-6
%F li-ma-2026-aim
%X Interleaved-Modal Chain-of-Thought (I-MCoT) advances vision-language reasoning, such as Visual Question Answering (VQA). This paradigm integrates specially selected visual evidence from the input image into the context of Vision-Language Models (VLMs), enabling them to ground their reasoning logic in these details. Accordingly, the efficacy of an I-MCoT framework relies on identifying *what* to see (evidence selection) and *when* to see it (triggering of insertions). However, existing methods fall short in both aspects. First, for selection, they rely on attention signals, which are unreliable—particularly under severe granularity imbalance between the brief textual query and the informative image. Second, for triggering, they adopt static triggers, which fail to capture the VLMs’ dynamic needs for visual evidence. To this end, we propose a novel I-MCoT framework, **A**ctive **I**nformation-driven **M**ulti-modal **C**hain-**o**f-**T**hought (**AIM-CoT**), which aims to improve both evidence selection and insertion triggering via: (1) **Context-enhanced Attention-map Generation (CAG)** to mitigate granularity imbalance via textual context enhancement; (2) **Active Visual Probing (AVP)** to proactively select the most informative evidence via an information foraging process; and (3) **Dynamic Attention-shift Trigger (DAT)** to precisely activate insertions when VLM’s attention shifts from text to visual context. Experiments across three benchmarks and four backbones demonstrate AIM-CoT’s consistent superiority. Our code is available at https://anonymous.4open.science/r/AIMCoT.
%R 10.18653/v1/2026.acl-long.1227
%U https://aclanthology.org/2026.acl-long.1227/
%U https://doi.org/10.18653/v1/2026.acl-long.1227
%P 26656-26681
Markdown (Informal)
[AIM-CoT: Active Information-driven Multimodal Chain-of-Thought for Vision-Language Reasoning](https://aclanthology.org/2026.acl-long.1227/) (Li & Ma, ACL 2026)
ACL