@inproceedings{sahay-etal-2026-mirage,
title = "{MIRAGE}: Metadata-guided Image Retrieval and Answer Generation for {E}-commerce Troubleshooting",
author = "Sahay, Rishav and
Tekumalla, Lavanya Sita and
Saladi, Anoop",
editor = {Matusevych, Yevgen and
Eryi{\u{g}}it, G{\"u}l{\c{s}}en and
Aletras, Nikolaos},
booktitle = "Proceedings of the 19th Conference of the {E}uropean Chapter of the {A}ssociation for {C}omputational {L}inguistics (Volume 5: Industry Track)",
month = mar,
year = "2026",
address = "Rabat, Morocco",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.eacl-industry.56/",
pages = "764--776",
ISBN = "979-8-89176-384-5",
abstract = "Existing multimodal systems typically associate text and available images based on embedding similarity or simple co-location, but such approaches often fail to ensure that the linked image accurately depicts the specific product or component mentioned in a troubleshooting instruction. We introduce MIRAGE, a metadata-first paradigm that treats structured metadata, (not raw pixels), as a first-class modality for multimodal grounding. In MIRAGE, both text and images are projected through a shared semantic schema capturing product attributes, context, and visual aspects, enabling reasoning over interpretable attributes for troubleshooting rather than unstructured embeddings. MIRAGE comprises of three complementary modules: M-Link for schema-guided image{--}text linking, M-Gen for metadata-conditioned multimodal generation, and M-Eval for consistency evaluation in the same structured space. Experiments on large-scale enterprise e-commerce troubleshooting data across 10 product types on 100K text chunks and 35K images show that metadata-centric grounding achieves over 40{\%} higher linking coverage of high-quality visual content and over 45{\%} in linking and response quality than embedding-based baselines. MIRAGE demonstrates the potential of structured metadata in enabling scalable, fine-grained grounding in multimodal troubleshooting systems."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="sahay-etal-2026-mirage">
<titleInfo>
<title>MIRAGE: Metadata-guided Image Retrieval and Answer Generation for E-commerce Troubleshooting</title>
</titleInfo>
<name type="personal">
<namePart type="given">Rishav</namePart>
<namePart type="family">Sahay</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lavanya</namePart>
<namePart type="given">Sita</namePart>
<namePart type="family">Tekumalla</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anoop</namePart>
<namePart type="family">Saladi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 19th Conference of the European Chapter of the Association for Computational Linguistics (Volume 5: Industry Track)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yevgen</namePart>
<namePart type="family">Matusevych</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gülşen</namePart>
<namePart type="family">Eryiğit</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nikolaos</namePart>
<namePart type="family">Aletras</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Rabat, Morocco</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-384-5</identifier>
</relatedItem>
<abstract>Existing multimodal systems typically associate text and available images based on embedding similarity or simple co-location, but such approaches often fail to ensure that the linked image accurately depicts the specific product or component mentioned in a troubleshooting instruction. We introduce MIRAGE, a metadata-first paradigm that treats structured metadata, (not raw pixels), as a first-class modality for multimodal grounding. In MIRAGE, both text and images are projected through a shared semantic schema capturing product attributes, context, and visual aspects, enabling reasoning over interpretable attributes for troubleshooting rather than unstructured embeddings. MIRAGE comprises of three complementary modules: M-Link for schema-guided image–text linking, M-Gen for metadata-conditioned multimodal generation, and M-Eval for consistency evaluation in the same structured space. Experiments on large-scale enterprise e-commerce troubleshooting data across 10 product types on 100K text chunks and 35K images show that metadata-centric grounding achieves over 40% higher linking coverage of high-quality visual content and over 45% in linking and response quality than embedding-based baselines. MIRAGE demonstrates the potential of structured metadata in enabling scalable, fine-grained grounding in multimodal troubleshooting systems.</abstract>
<identifier type="citekey">sahay-etal-2026-mirage</identifier>
<location>
<url>https://aclanthology.org/2026.eacl-industry.56/</url>
</location>
<part>
<date>2026-03</date>
<extent unit="page">
<start>764</start>
<end>776</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T MIRAGE: Metadata-guided Image Retrieval and Answer Generation for E-commerce Troubleshooting
%A Sahay, Rishav
%A Tekumalla, Lavanya Sita
%A Saladi, Anoop
%Y Matusevych, Yevgen
%Y Eryiğit, Gülşen
%Y Aletras, Nikolaos
%S Proceedings of the 19th Conference of the European Chapter of the Association for Computational Linguistics (Volume 5: Industry Track)
%D 2026
%8 March
%I Association for Computational Linguistics
%C Rabat, Morocco
%@ 979-8-89176-384-5
%F sahay-etal-2026-mirage
%X Existing multimodal systems typically associate text and available images based on embedding similarity or simple co-location, but such approaches often fail to ensure that the linked image accurately depicts the specific product or component mentioned in a troubleshooting instruction. We introduce MIRAGE, a metadata-first paradigm that treats structured metadata, (not raw pixels), as a first-class modality for multimodal grounding. In MIRAGE, both text and images are projected through a shared semantic schema capturing product attributes, context, and visual aspects, enabling reasoning over interpretable attributes for troubleshooting rather than unstructured embeddings. MIRAGE comprises of three complementary modules: M-Link for schema-guided image–text linking, M-Gen for metadata-conditioned multimodal generation, and M-Eval for consistency evaluation in the same structured space. Experiments on large-scale enterprise e-commerce troubleshooting data across 10 product types on 100K text chunks and 35K images show that metadata-centric grounding achieves over 40% higher linking coverage of high-quality visual content and over 45% in linking and response quality than embedding-based baselines. MIRAGE demonstrates the potential of structured metadata in enabling scalable, fine-grained grounding in multimodal troubleshooting systems.
%U https://aclanthology.org/2026.eacl-industry.56/
%P 764-776
Markdown (Informal)
[MIRAGE: Metadata-guided Image Retrieval and Answer Generation for E-commerce Troubleshooting](https://aclanthology.org/2026.eacl-industry.56/) (Sahay et al., EACL 2026)
ACL