@inproceedings{nebbia-kovashka-2024-synonym,
    title = "Synonym relations affect object detection learned on vision-language data",
    author = "Nebbia, Giacomo  and
      Kovashka, Adriana",
    editor = "Duh, Kevin  and
      Gomez, Helena  and
      Bethard, Steven",
    booktitle = "Findings of the Association for Computational Linguistics: NAACL 2024",
    month = jun,
    year = "2024",
    address = "Mexico City, Mexico",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2024.findings-naacl.239",
    doi = "10.18653/v1/2024.findings-naacl.239",
    pages = "3770--3776",
    abstract = "We analyze whether object detectors trained on vision-language data learn effective visual representations for synonyms. Since many current vision-language models accept user-provided textual input, we highlight the need for such models to learn feature representations that are robust to changes in how such input is provided. Specifically, we analyze changes in synonyms used to refer to objects. Here, we study object detectors trained on vision-language data and investigate how to make their performance less dependent on whether synonyms are used to refer to an object. We propose two approaches to achieve this goal: data augmentation by back-translation and class embedding enrichment. We show the promise of such approaches, reporting improved performance on synonyms from mAP@0.5=33.87{\%} to 37.93{\%}.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="nebbia-kovashka-2024-synonym">
    <titleInfo>
      <title>Synonym relations affect object detection learned on vision-language data</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Giacomo</namePart>
      <namePart type="family">Nebbia</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Adriana</namePart>
      <namePart type="family">Kovashka</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2024-06</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: NAACL 2024</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Kevin</namePart>
        <namePart type="family">Duh</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Helena</namePart>
        <namePart type="family">Gomez</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Steven</namePart>
        <namePart type="family">Bethard</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Mexico City, Mexico</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>We analyze whether object detectors trained on vision-language data learn effective visual representations for synonyms. Since many current vision-language models accept user-provided textual input, we highlight the need for such models to learn feature representations that are robust to changes in how such input is provided. Specifically, we analyze changes in synonyms used to refer to objects. Here, we study object detectors trained on vision-language data and investigate how to make their performance less dependent on whether synonyms are used to refer to an object. We propose two approaches to achieve this goal: data augmentation by back-translation and class embedding enrichment. We show the promise of such approaches, reporting improved performance on synonyms from mAP@0.5=33.87% to 37.93%.</abstract>
    <identifier type="citekey">nebbia-kovashka-2024-synonym</identifier>
    <identifier type="doi">10.18653/v1/2024.findings-naacl.239</identifier>
    <location>
      <url>https://aclanthology.org/2024.findings-naacl.239</url>
    </location>
    <part>
      <date>2024-06</date>
      <extent unit="page">
        <start>3770</start>
        <end>3776</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Synonym relations affect object detection learned on vision-language data
%A Nebbia, Giacomo
%A Kovashka, Adriana
%Y Duh, Kevin
%Y Gomez, Helena
%Y Bethard, Steven
%S Findings of the Association for Computational Linguistics: NAACL 2024
%D 2024
%8 June
%I Association for Computational Linguistics
%C Mexico City, Mexico
%F nebbia-kovashka-2024-synonym
%X We analyze whether object detectors trained on vision-language data learn effective visual representations for synonyms. Since many current vision-language models accept user-provided textual input, we highlight the need for such models to learn feature representations that are robust to changes in how such input is provided. Specifically, we analyze changes in synonyms used to refer to objects. Here, we study object detectors trained on vision-language data and investigate how to make their performance less dependent on whether synonyms are used to refer to an object. We propose two approaches to achieve this goal: data augmentation by back-translation and class embedding enrichment. We show the promise of such approaches, reporting improved performance on synonyms from mAP@0.5=33.87% to 37.93%.
%R 10.18653/v1/2024.findings-naacl.239
%U https://aclanthology.org/2024.findings-naacl.239
%U https://doi.org/10.18653/v1/2024.findings-naacl.239
%P 3770-3776
Markdown (Informal)

[Synonym relations affect object detection learned on vision-language data](https://aclanthology.org/2024.findings-naacl.239) (Nebbia & Kovashka, Findings 2024)

ACL

Giacomo Nebbia and Adriana Kovashka. 2024. Synonym relations affect object detection learned on vision-language data. In Findings of the Association for Computational Linguistics: NAACL 2024, pages 3770–3776, Mexico City, Mexico. Association for Computational Linguistics.