BibTeX
@inproceedings{oguz-etal-2023-find,
title = "Find-2-Find: Multitask Learning for Anaphora Resolution and Object Localization",
author = "Oguz, Cennet and
Denis, Pascal and
Vincent, Emmanuel and
Ostermann, Simon and
van Genabith, Josef",
editor = "Bouamor, Houda and
Pino, Juan and
Bali, Kalika",
booktitle = "Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing",
month = dec,
year = "2023",
address = "Singapore",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.emnlp-main.504",
doi = "10.18653/v1/2023.emnlp-main.504",
pages = "8099--8110",
abstract = "In multimodal understanding tasks, visual and linguistic ambiguities can arise. Visual ambiguity can occur when visual objects require a model to ground a referring expression in a video without strong supervision, while linguistic ambiguity can occur from changes in entities in action flows. As an example from the cooking domain, {``}oil{''} mixed with {``}salt{''} and {``}pepper{''} could later be referred to as a {``}mixture{''}. Without a clear visual-linguistic alignment, we cannot know which among several objects shown is referred to by the language expression {``}mixture{''}, and without resolved antecedents, we cannot pinpoint what the mixture is. We define this chicken-and-egg problem as Visual-linguistic Ambiguity. In this paper, we present Find2Find, a joint anaphora resolution and object localization dataset targeting the problem of \textit{visual-linguistic ambiguity}, consisting of 500 anaphora-annotated recipes with corresponding videos. We present experimental results of a novel end-to-end joint multitask learning framework for Find2Find that fuses visual and textual information and shows improvements both for anaphora resolution and object localization with one joint model in multitask learning, as compared to a strong single-task baseline.",
}
MODS XML
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="oguz-etal-2023-find">
<titleInfo>
<title>Find-2-Find: Multitask Learning for Anaphora Resolution and Object Localization</title>
</titleInfo>
<name type="personal">
<namePart type="given">Cennet</namePart>
<namePart type="family">Oguz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pascal</namePart>
<namePart type="family">Denis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Emmanuel</namePart>
<namePart type="family">Vincent</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Simon</namePart>
<namePart type="family">Ostermann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Josef</namePart>
<namePart type="family">van Genabith</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Houda</namePart>
<namePart type="family">Bouamor</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Juan</namePart>
<namePart type="family">Pino</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kalika</namePart>
<namePart type="family">Bali</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Singapore</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>In multimodal understanding tasks, visual and linguistic ambiguities can arise. Visual ambiguity can occur when visual objects require a model to ground a referring expression in a video without strong supervision, while linguistic ambiguity can occur from changes in entities in action flows. As an example from the cooking domain, “oil” mixed with “salt” and “pepper” could later be referred to as a “mixture”. Without a clear visual-linguistic alignment, we cannot know which among several objects shown is referred to by the language expression “mixture”, and without resolved antecedents, we cannot pinpoint what the mixture is. We define this chicken-and-egg problem as Visual-linguistic Ambiguity. In this paper, we present Find2Find, a joint anaphora resolution and object localization dataset targeting the problem of visual-linguistic ambiguity, consisting of 500 anaphora-annotated recipes with corresponding videos. We present experimental results of a novel end-to-end joint multitask learning framework for Find2Find that fuses visual and textual information and shows improvements both for anaphora resolution and object localization with one joint model in multitask learning, as compared to a strong single-task baseline.</abstract>
<identifier type="citekey">oguz-etal-2023-find</identifier>
<identifier type="doi">10.18653/v1/2023.emnlp-main.504</identifier>
<location>
<url>https://aclanthology.org/2023.emnlp-main.504</url>
</location>
<part>
<date>2023-12</date>
<extent unit="page">
<start>8099</start>
<end>8110</end>
</extent>
</part>
</mods>
</modsCollection>
Endnote
%0 Conference Proceedings
%T Find-2-Find: Multitask Learning for Anaphora Resolution and Object Localization
%A Oguz, Cennet
%A Denis, Pascal
%A Vincent, Emmanuel
%A Ostermann, Simon
%A van Genabith, Josef
%Y Bouamor, Houda
%Y Pino, Juan
%Y Bali, Kalika
%S Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing
%D 2023
%8 December
%I Association for Computational Linguistics
%C Singapore
%F oguz-etal-2023-find
%X In multimodal understanding tasks, visual and linguistic ambiguities can arise. Visual ambiguity can occur when visual objects require a model to ground a referring expression in a video without strong supervision, while linguistic ambiguity can occur from changes in entities in action flows. As an example from the cooking domain, “oil” mixed with “salt” and “pepper” could later be referred to as a “mixture”. Without a clear visual-linguistic alignment, we cannot know which among several objects shown is referred to by the language expression “mixture”, and without resolved antecedents, we cannot pinpoint what the mixture is. We define this chicken-and-egg problem as Visual-linguistic Ambiguity. In this paper, we present Find2Find, a joint anaphora resolution and object localization dataset targeting the problem of visual-linguistic ambiguity, consisting of 500 anaphora-annotated recipes with corresponding videos. We present experimental results of a novel end-to-end joint multitask learning framework for Find2Find that fuses visual and textual information and shows improvements both for anaphora resolution and object localization with one joint model in multitask learning, as compared to a strong single-task baseline.
%R 10.18653/v1/2023.emnlp-main.504
%U https://aclanthology.org/2023.emnlp-main.504
%U https://doi.org/10.18653/v1/2023.emnlp-main.504
%P 8099-8110
Markdown (Informal)
[Find-2-Find: Multitask Learning for Anaphora Resolution and Object Localization](https://aclanthology.org/2023.emnlp-main.504) (Oguz et al., EMNLP 2023)
ACL
Cennet Oguz, Pascal Denis, Emmanuel Vincent, Simon Ostermann, and Josef van Genabith. 2023. [Find-2-Find: Multitask Learning for Anaphora Resolution and Object Localization](https://aclanthology.org/2023.emnlp-main.504). In *Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing*, pages 8099–8110, Singapore. Association for Computational Linguistics.