@inproceedings{yu-etal-2025-isr,
title = "{ISR}: Self-Refining Referring Expressions for Entity Grounding",
author = "Yu, Zhuocheng and
Zhao, Bingchan and
Song, Yifan and
Li, Sujian and
He, Zhonghui",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.acl-long.1483/",
doi = "10.18653/v1/2025.acl-long.1483",
pages = "30702--30714",
ISBN = "979-8-89176-251-0",
abstract = "Entity grounding, a crucial task in constructing multimodal knowledge graphs, aims to align entities from knowledge graphs with their corresponding images. Unlike conventional visual grounding tasks that use referring expressions (REs) as inputs, entity grounding relies solely on entity names and types, presenting a significant challenge. To address this, we introduce a novel **I**terative **S**elf-**R**efinement (**ISR**) scheme to enhance the multimodal large language model{'}s capability to generate high quality REs for the given entities as explicit contextual clues. This training scheme, inspired by human learning dynamics and human annotation processes, enables the MLLM to iteratively generate and refine REs by learning from successes and failures, guided by outcome rewards from a visual grounding model. This iterative cycle of self-refinement avoids overfitting to fixed annotations and fosters continued improvement in referring expression generation. Extensive experiments demonstrate that our methods surpasses other methods in entity grounding, highlighting its effectiveness, robustness and potential for broader applications."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="yu-etal-2025-isr">
<titleInfo>
<title>ISR: Self-Refining Referring Expressions for Entity Grounding</title>
</titleInfo>
<name type="personal">
<namePart type="given">Zhuocheng</namePart>
<namePart type="family">Yu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bingchan</namePart>
<namePart type="family">Zhao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yifan</namePart>
<namePart type="family">Song</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sujian</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhonghui</namePart>
<namePart type="family">He</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wanxiang</namePart>
<namePart type="family">Che</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joyce</namePart>
<namePart type="family">Nabende</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Shutova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohammad</namePart>
<namePart type="given">Taher</namePart>
<namePart type="family">Pilehvar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-251-0</identifier>
</relatedItem>
<abstract>Entity grounding, a crucial task in constructing multimodal knowledge graphs, aims to align entities from knowledge graphs with their corresponding images. Unlike conventional visual grounding tasks that use referring expressions (REs) as inputs, entity grounding relies solely on entity names and types, presenting a significant challenge. To address this, we introduce a novel Iterative Self-Refinement (ISR) scheme to enhance the capability of the multimodal large language model (MLLM) to generate high-quality REs for the given entities as explicit contextual clues. This training scheme, inspired by human learning dynamics and human annotation processes, enables the MLLM to iteratively generate and refine REs by learning from successes and failures, guided by outcome rewards from a visual grounding model. This iterative cycle of self-refinement avoids overfitting to fixed annotations and fosters continued improvement in referring expression generation. Extensive experiments demonstrate that our method surpasses other methods in entity grounding, highlighting its effectiveness, robustness, and potential for broader applications.</abstract>
<identifier type="citekey">yu-etal-2025-isr</identifier>
<identifier type="doi">10.18653/v1/2025.acl-long.1483</identifier>
<location>
<url>https://aclanthology.org/2025.acl-long.1483/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>30702</start>
<end>30714</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T ISR: Self-Refining Referring Expressions for Entity Grounding
%A Yu, Zhuocheng
%A Zhao, Bingchan
%A Song, Yifan
%A Li, Sujian
%A He, Zhonghui
%Y Che, Wanxiang
%Y Nabende, Joyce
%Y Shutova, Ekaterina
%Y Pilehvar, Mohammad Taher
%S Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-251-0
%F yu-etal-2025-isr
%X Entity grounding, a crucial task in constructing multimodal knowledge graphs, aims to align entities from knowledge graphs with their corresponding images. Unlike conventional visual grounding tasks that use referring expressions (REs) as inputs, entity grounding relies solely on entity names and types, presenting a significant challenge. To address this, we introduce a novel Iterative Self-Refinement (ISR) scheme to enhance the capability of the multimodal large language model (MLLM) to generate high-quality REs for the given entities as explicit contextual clues. This training scheme, inspired by human learning dynamics and human annotation processes, enables the MLLM to iteratively generate and refine REs by learning from successes and failures, guided by outcome rewards from a visual grounding model. This iterative cycle of self-refinement avoids overfitting to fixed annotations and fosters continued improvement in referring expression generation. Extensive experiments demonstrate that our method surpasses other methods in entity grounding, highlighting its effectiveness, robustness, and potential for broader applications.
%R 10.18653/v1/2025.acl-long.1483
%U https://aclanthology.org/2025.acl-long.1483/
%U https://doi.org/10.18653/v1/2025.acl-long.1483
%P 30702-30714
Markdown (Informal)
[ISR: Self-Refining Referring Expressions for Entity Grounding](https://aclanthology.org/2025.acl-long.1483/) (Yu et al., ACL 2025)
ACL
Zhuocheng Yu, Bingchan Zhao, Yifan Song, Sujian Li, and Zhonghui He. 2025. ISR: Self-Refining Referring Expressions for Entity Grounding. In Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 30702–30714, Vienna, Austria. Association for Computational Linguistics.
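
The abstract describes a simple outer loop: an MLLM proposes a referring expression for an entity, a visual grounding model returns an outcome reward, and the accumulated successes and failures drive the next round of generation. The sketch below illustrates only that loop as stated in the abstract; every name and signature (`generate_re`, `grounding_reward`, `isr_loop`) is a hypothetical stand-in, not the authors' implementation.

```python
# Minimal sketch of the iterative self-refinement (ISR) loop as described in
# the abstract. All component names here are hypothetical stand-ins.
from dataclasses import dataclass
from typing import Callable, List, Tuple

@dataclass
class Entity:
    name: str   # e.g. "Eiffel Tower"
    etype: str  # e.g. "landmark"

# The two models the abstract mentions, abstracted as callables:
GenerateRE = Callable[[Entity, List[str]], str]  # MLLM: entity + past REs -> new RE
GroundingReward = Callable[[str, str], float]    # VG model: (RE, image) -> outcome reward

def isr_loop(entity: Entity, image: str,
             generate_re: GenerateRE,
             grounding_reward: GroundingReward,
             n_rounds: int = 3,
             success_threshold: float = 0.5) -> Tuple[str, List[Tuple[str, float]]]:
    """Iteratively generate and refine REs, keeping the (RE, reward) history
    of successes and failures that training would learn from."""
    history: List[Tuple[str, float]] = []
    best_re, best_reward = "", float("-inf")
    for _ in range(n_rounds):
        # The MLLM conditions on the entity and its earlier attempts.
        re_text = generate_re(entity, [re for re, _ in history])
        reward = grounding_reward(re_text, image)  # outcome signal from the VG model
        history.append((re_text, reward))
        if reward > best_reward:
            best_re, best_reward = re_text, reward
        if reward >= success_threshold:            # good enough: stop refining
            break
    # In training, `history` would supply positive/negative examples for
    # updating the MLLM; here we simply return it alongside the best RE.
    return best_re, history

# Toy usage with dummy models (the reward just favors longer descriptions):
dummy_gen = lambda e, past: f"the {e.etype} named {e.name}, attempt {len(past) + 1}"
dummy_reward = lambda re_text, img: min(1.0, len(re_text) / 60)
print(isr_loop(Entity("Eiffel Tower", "landmark"), "paris.jpg", dummy_gen, dummy_reward))
```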