@inproceedings{cirik-etal-2022-holm,
title = "{HOLM}: Hallucinating Objects with Language Models for Referring Expression Recognition in Partially-Observed Scenes",
author = "Cirik, Volkan and
Morency, Louis-Philippe and
Berg-Kirkpatrick, Taylor",
editor = "Muresan, Smaranda and
Nakov, Preslav and
Villavicencio, Aline",
booktitle = "Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = may,
year = "2022",
address = "Dublin, Ireland",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.acl-long.373/",
doi = "10.18653/v1/2022.acl-long.373",
pages = "5440--5453",
abstract = "AI systems embodied in the physical world face a fundamental challenge of partial observability; operating with only a limited view and knowledge of the environment. This creates challenges when AI systems try to reason about language and its relationship with the environment: objects referred to through language (e.g. giving many instructions) are not immediately visible. Actions by the AI system may be required to bring these objects in view. A good benchmark to study this challenge is Dynamic Referring Expression Recognition (dRER) task, where the goal is to find a target location by dynamically adjusting the field of view (FoV) in a partially observed 360 scenes. In this paper, we introduce HOLM, Hallucinating Objects with Language Models, to address the challenge of partial observability. HOLM uses large pre-trained language models (LMs) to infer object hallucinations for the unobserved part of the environment. Our core intuition is that if a pair of objects co-appear in an environment frequently, our usage of language should reflect this fact about the world. Based on this intuition, we prompt language models to extract knowledge about object affinities which gives us a proxy for spatial relationships of objects. Our experiments show that HOLM performs better than the state-of-the-art approaches on two datasets for dRER; allowing to study generalization for both indoor and outdoor settings."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="cirik-etal-2022-holm">
<titleInfo>
<title>HOLM: Hallucinating Objects with Language Models for Referring Expression Recognition in Partially-Observed Scenes</title>
</titleInfo>
<name type="personal">
<namePart type="given">Volkan</namePart>
<namePart type="family">Cirik</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Louis-Philippe</namePart>
<namePart type="family">Morency</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Taylor</namePart>
<namePart type="family">Berg-Kirkpatrick</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Smaranda</namePart>
<namePart type="family">Muresan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Preslav</namePart>
<namePart type="family">Nakov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aline</namePart>
<namePart type="family">Villavicencio</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Dublin, Ireland</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>AI systems embodied in the physical world face a fundamental challenge of partial observability; operating with only a limited view and knowledge of the environment. This creates challenges when AI systems try to reason about language and its relationship with the environment: objects referred to through language (e.g. giving many instructions) are not immediately visible. Actions by the AI system may be required to bring these objects in view. A good benchmark to study this challenge is Dynamic Referring Expression Recognition (dRER) task, where the goal is to find a target location by dynamically adjusting the field of view (FoV) in a partially observed 360 scenes. In this paper, we introduce HOLM, Hallucinating Objects with Language Models, to address the challenge of partial observability. HOLM uses large pre-trained language models (LMs) to infer object hallucinations for the unobserved part of the environment. Our core intuition is that if a pair of objects co-appear in an environment frequently, our usage of language should reflect this fact about the world. Based on this intuition, we prompt language models to extract knowledge about object affinities which gives us a proxy for spatial relationships of objects. Our experiments show that HOLM performs better than the state-of-the-art approaches on two datasets for dRER; allowing to study generalization for both indoor and outdoor settings.</abstract>
<identifier type="citekey">cirik-etal-2022-holm</identifier>
<identifier type="doi">10.18653/v1/2022.acl-long.373</identifier>
<location>
<url>https://aclanthology.org/2022.acl-long.373/</url>
</location>
<part>
<date>2022-05</date>
<extent unit="page">
<start>5440</start>
<end>5453</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T HOLM: Hallucinating Objects with Language Models for Referring Expression Recognition in Partially-Observed Scenes
%A Cirik, Volkan
%A Morency, Louis-Philippe
%A Berg-Kirkpatrick, Taylor
%Y Muresan, Smaranda
%Y Nakov, Preslav
%Y Villavicencio, Aline
%S Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2022
%8 May
%I Association for Computational Linguistics
%C Dublin, Ireland
%F cirik-etal-2022-holm
%X AI systems embodied in the physical world face a fundamental challenge of partial observability; operating with only a limited view and knowledge of the environment. This creates challenges when AI systems try to reason about language and its relationship with the environment: objects referred to through language (e.g. giving many instructions) are not immediately visible. Actions by the AI system may be required to bring these objects in view. A good benchmark to study this challenge is Dynamic Referring Expression Recognition (dRER) task, where the goal is to find a target location by dynamically adjusting the field of view (FoV) in a partially observed 360 scenes. In this paper, we introduce HOLM, Hallucinating Objects with Language Models, to address the challenge of partial observability. HOLM uses large pre-trained language models (LMs) to infer object hallucinations for the unobserved part of the environment. Our core intuition is that if a pair of objects co-appear in an environment frequently, our usage of language should reflect this fact about the world. Based on this intuition, we prompt language models to extract knowledge about object affinities which gives us a proxy for spatial relationships of objects. Our experiments show that HOLM performs better than the state-of-the-art approaches on two datasets for dRER; allowing to study generalization for both indoor and outdoor settings.
%R 10.18653/v1/2022.acl-long.373
%U https://aclanthology.org/2022.acl-long.373/
%U https://doi.org/10.18653/v1/2022.acl-long.373
%P 5440-5453
Markdown (Informal)
[HOLM: Hallucinating Objects with Language Models for Referring Expression Recognition in Partially-Observed Scenes](https://aclanthology.org/2022.acl-long.373/) (Cirik et al., ACL 2022)
ACL