@inproceedings{soni-etal-2026-know,
title = "Know What You See: Grounded localization of product components",
author = "Soni, Manan and
Kanagarajan, Abinesh and
Mohan, Shyam",
editor = "Li, Yunyao and
Rehm, Georg and
Tu, Mei",
booktitle = "Proceedings of the 64th Annual Meeting of the {A}ssociation for {C}omputational {L}inguistics ({ACL} 2026)",
month = jul,
year = "2026",
address = "San Diego, California, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.acl-industry.75/",
pages = "1077--1088",
ISBN = "979-8-89176-394-4",
abstract = "Many real-world decisions about products (e.g., how they function, how they should be used) depend on their components rather than the object as a whole. Accurately identifying product components has applications like automated defect detection, visual spare-parts search, and verified assembly. However, existing object detectors treat components as isolated objects, ignoring their inherent structure. We propose Know What You See (KWYS), where we localize components by grounding them using a textual knowledge base (e.g., manuals or web descriptions). KWYS converts raw text into a hierarchical component taxonomy, which then guides an open-vocabulary object detector using a hierarchical verification algorithm. We evaluate on 1,000 product images across 5 diverse categories, improving component localization accuracy by 11{\%} along with reducing component hallucinations by 25{\%}."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="soni-etal-2026-know">
<titleInfo>
<title>Know What You See: Grounded localization of product components</title>
</titleInfo>
<name type="personal">
<namePart type="given">Manan</namePart>
<namePart type="family">Soni</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Abinesh</namePart>
<namePart type="family">Kanagarajan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shyam</namePart>
<namePart type="family">Mohan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (ACL 2026)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yunyao</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Georg</namePart>
<namePart type="family">Rehm</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mei</namePart>
<namePart type="family">Tu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-394-4</identifier>
</relatedItem>
<abstract>Many real-world decisions about products (e.g., how they function, how they should be used) depend on their components rather than the object as a whole. Accurately identifying product components has applications like automated defect detection, visual spare-parts search, and verified assembly. However, existing object detectors treat components as isolated objects, ignoring their inherent structure. We propose Know What You See (KWYS), where we localize components by grounding them using a textual knowledge base (e.g., manuals or web descriptions). KWYS converts raw text into a hierarchical component taxonomy, which then guides an open-vocabulary object detector using a hierarchical verification algorithm. We evaluate on 1,000 product images across 5 diverse categories, improving component localization accuracy by 11% along with reducing component hallucinations by 25%.</abstract>
<identifier type="citekey">soni-etal-2026-know</identifier>
<location>
<url>https://aclanthology.org/2026.acl-industry.75/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>1077</start>
<end>1088</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Know What You See: Grounded localization of product components
%A Soni, Manan
%A Kanagarajan, Abinesh
%A Mohan, Shyam
%Y Li, Yunyao
%Y Rehm, Georg
%Y Tu, Mei
%S Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (ACL 2026)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, USA
%@ 979-8-89176-394-4
%F soni-etal-2026-know
%X Many real-world decisions about products (e.g., how they function, how they should be used) depend on their components rather than the object as a whole. Accurately identifying product components has applications like automated defect detection, visual spare-parts search, and verified assembly. However, existing object detectors treat components as isolated objects, ignoring their inherent structure. We propose Know What You See (KWYS), where we localize components by grounding them using a textual knowledge base (e.g., manuals or web descriptions). KWYS converts raw text into a hierarchical component taxonomy, which then guides an open-vocabulary object detector using a hierarchical verification algorithm. We evaluate on 1,000 product images across 5 diverse categories, improving component localization accuracy by 11% along with reducing component hallucinations by 25%.
%U https://aclanthology.org/2026.acl-industry.75/
%P 1077-1088
Markdown (Informal)
[Know What You See: Grounded localization of product components](https://aclanthology.org/2026.acl-industry.75/) (Soni et al., ACL 2026)
ACL
- Manan Soni, Abinesh Kanagarajan, and Shyam Mohan. 2026. Know What You See: Grounded localization of product components. In Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (ACL 2026), pages 1077–1088, San Diego, California, USA. Association for Computational Linguistics.