@inproceedings{rosch-etal-2024-enhancing,
    title = "Enhancing Conceptual Understanding in Multimodal Contrastive Learning through Hard Negative Samples",
    author = {R{\"o}sch, Philipp J. and
      Oswald, Norbert and
      Geierhos, Michaela and
      Libovick{\'y}, Jind{\v{r}}ich},
    editor = "Gu, Jing and
      Fu, Tsu-Jui (Ray) and
      Hudson, Drew and
      Celikyilmaz, Asli and
      Wang, William",
    booktitle = "Proceedings of the 3rd Workshop on Advances in Language and Vision Research (ALVR)",
    month = aug,
    year = "2024",
    address = "Bangkok, Thailand",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2024.alvr-1.9/",
    doi = "10.18653/v1/2024.alvr-1.9",
    pages = "102--115",
    abstract = "Current vision-language models leveraging contrastive learning often face limitations in developing fine-grained conceptual understanding. This is due to random negative samples during pretraining, causing almost exclusively very dissimilar concepts to be compared in the loss function. Consequently, the models struggle with fine-grained semantic differences. To address this problem, we introduce a novel pretraining method incorporating synthetic hard negative text examples. The hard negatives replace terms corresponding to visual concepts, leading to a more fine-grained visual and textual concept alignment. Further, we introduce InpaintCOCO, a new challenging dataset for assessing the fine-grained alignment of colors, objects, and sizes in vision-language models. We created the dataset using generative inpainting from COCO images by changing the visual concepts so that the images no longer match their original captions. Our results show significant improvements in fine-grained concept understanding across various vision-language datasets, including our InpaintCOCO dataset."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="rosch-etal-2024-enhancing">
    <titleInfo>
      <title>Enhancing Conceptual Understanding in Multimodal Contrastive Learning through Hard Negative Samples</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Philipp</namePart>
      <namePart type="given">J</namePart>
      <namePart type="family">Rösch</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Norbert</namePart>
      <namePart type="family">Oswald</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Michaela</namePart>
      <namePart type="family">Geierhos</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Jindřich</namePart>
      <namePart type="family">Libovický</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2024-08</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 3rd Workshop on Advances in Language and Vision Research (ALVR)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Jing</namePart>
        <namePart type="family">Gu</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Tsu-Jui</namePart>
        <namePart type="given">(Ray)</namePart>
        <namePart type="family">Fu</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Drew</namePart>
        <namePart type="family">Hudson</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Asli</namePart>
        <namePart type="family">Celikyilmaz</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">William</namePart>
        <namePart type="family">Wang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Bangkok, Thailand</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Current vision-language models leveraging contrastive learning often face limitations in developing fine-grained conceptual understanding. This is due to random negative samples during pretraining, causing almost exclusively very dissimilar concepts to be compared in the loss function. Consequently, the models struggle with fine-grained semantic differences. To address this problem, we introduce a novel pretraining method incorporating synthetic hard negative text examples. The hard negatives replace terms corresponding to visual concepts, leading to a more fine-grained visual and textual concept alignment. Further, we introduce InpaintCOCO, a new challenging dataset for assessing the fine-grained alignment of colors, objects, and sizes in vision-language models. We created the dataset using generative inpainting from COCO images by changing the visual concepts so that the images no longer match their original captions. Our results show significant improvements in fine-grained concept understanding across various vision-language datasets, including our InpaintCOCO dataset.</abstract>
    <identifier type="citekey">rosch-etal-2024-enhancing</identifier>
    <identifier type="doi">10.18653/v1/2024.alvr-1.9</identifier>
    <location>
      <url>https://aclanthology.org/2024.alvr-1.9/</url>
    </location>
    <part>
      <date>2024-08</date>
      <extent unit="page">
        <start>102</start>
        <end>115</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Enhancing Conceptual Understanding in Multimodal Contrastive Learning through Hard Negative Samples
%A Rösch, Philipp J.
%A Oswald, Norbert
%A Geierhos, Michaela
%A Libovický, Jindřich
%Y Gu, Jing
%Y Fu, Tsu-Jui (Ray)
%Y Hudson, Drew
%Y Celikyilmaz, Asli
%Y Wang, William
%S Proceedings of the 3rd Workshop on Advances in Language and Vision Research (ALVR)
%D 2024
%8 August
%I Association for Computational Linguistics
%C Bangkok, Thailand
%F rosch-etal-2024-enhancing
%X Current vision-language models leveraging contrastive learning often face limitations in developing fine-grained conceptual understanding. This is due to random negative samples during pretraining, causing almost exclusively very dissimilar concepts to be compared in the loss function. Consequently, the models struggle with fine-grained semantic differences. To address this problem, we introduce a novel pretraining method incorporating synthetic hard negative text examples. The hard negatives replace terms corresponding to visual concepts, leading to a more fine-grained visual and textual concept alignment. Further, we introduce InpaintCOCO, a new challenging dataset for assessing the fine-grained alignment of colors, objects, and sizes in vision-language models. We created the dataset using generative inpainting from COCO images by changing the visual concepts so that the images no longer match their original captions. Our results show significant improvements in fine-grained concept understanding across various vision-language datasets, including our InpaintCOCO dataset.
%R 10.18653/v1/2024.alvr-1.9
%U https://aclanthology.org/2024.alvr-1.9/
%U https://doi.org/10.18653/v1/2024.alvr-1.9
%P 102-115
Markdown (Informal)
[Enhancing Conceptual Understanding in Multimodal Contrastive Learning through Hard Negative Samples](https://aclanthology.org/2024.alvr-1.9/) (Rösch et al., ALVR 2024)
ACL
Philipp J. Rösch, Norbert Oswald, Michaela Geierhos, and Jindřich Libovický. 2024. Enhancing Conceptual Understanding in Multimodal Contrastive Learning through Hard Negative Samples. In Proceedings of the 3rd Workshop on Advances in Language and Vision Research (ALVR), pages 102–115, Bangkok, Thailand. Association for Computational Linguistics.