@inproceedings{tran-etal-2022-contrastive,
title = "Contrastive Visual and Language Learning for Visual Relationship Detection",
author = "Tran, Thanh and
Neau, Maelic and
Santos, Paulo and
Powers, David",
editor = "Parameswaran, Pradeesh and
Biggs, Jennifer and
Powers, David",
booktitle = "Proceedings of the 20th Annual Workshop of the Australasian Language Technology Association",
month = dec,
year = "2022",
address = "Adelaide, Australia",
publisher = "Australasian Language Technology Association",
url = "https://aclanthology.org/2022.alta-1.23",
pages = "170--177",
abstract = "Visual Relationship Detection aims to understand real-world objects{'} interactions by grounding visual concepts to compositional visual relation triples, written in the form of (subject, predicate, object). Previous works have explored the use of contrastive learning to implicitly predict the predicates from the relevant image regions. However, these models often directly leverage in-distribution spatial and language co-occurrences biases during training, preventing the models from generalizing to out-of-distribution compositions. In this work, we examine whether contrastive vision and language models pre-trained on large-scale external image and text dataset can assist the detection of compositional visual relationships. To this end, we propose a semi-supervised contrastive fine-tuning approach for the visual relationship detection task. The results show that fine-tuned models that were pre-trained on larger datasets do not yield better performance when performing visual relationship detection, and larger models can yield lower performance when compared with their smaller counterparts.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="tran-etal-2022-contrastive">
<titleInfo>
<title>Contrastive Visual and Language Learning for Visual Relationship Detection</title>
</titleInfo>
<name type="personal">
<namePart type="given">Thanh</namePart>
<namePart type="family">Tran</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Maelic</namePart>
<namePart type="family">Neau</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Paulo</namePart>
<namePart type="family">Santos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Powers</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 20th Annual Workshop of the Australasian Language Technology Association</title>
</titleInfo>
<name type="personal">
<namePart type="given">Pradeesh</namePart>
<namePart type="family">Parameswaran</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jennifer</namePart>
<namePart type="family">Biggs</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Powers</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Australasian Language Technology Association</publisher>
<place>
<placeTerm type="text">Adelaide, Australia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Visual Relationship Detection aims to understand real-world objects’ interactions by grounding visual concepts to compositional visual relation triples, written in the form of (subject, predicate, object). Previous works have explored the use of contrastive learning to implicitly predict the predicates from the relevant image regions. However, these models often directly leverage in-distribution spatial and language co-occurrence biases during training, preventing the models from generalizing to out-of-distribution compositions. In this work, we examine whether contrastive vision and language models pre-trained on large-scale external image and text datasets can assist the detection of compositional visual relationships. To this end, we propose a semi-supervised contrastive fine-tuning approach for the visual relationship detection task. The results show that fine-tuned models that were pre-trained on larger datasets do not yield better performance when performing visual relationship detection, and larger models can yield lower performance when compared with their smaller counterparts.</abstract>
<identifier type="citekey">tran-etal-2022-contrastive</identifier>
<location>
<url>https://aclanthology.org/2022.alta-1.23</url>
</location>
<part>
<date>2022-12</date>
<extent unit="page">
<start>170</start>
<end>177</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Contrastive Visual and Language Learning for Visual Relationship Detection
%A Tran, Thanh
%A Neau, Maelic
%A Santos, Paulo
%A Powers, David
%Y Parameswaran, Pradeesh
%Y Biggs, Jennifer
%Y Powers, David
%S Proceedings of the 20th Annual Workshop of the Australasian Language Technology Association
%D 2022
%8 December
%I Australasian Language Technology Association
%C Adelaide, Australia
%F tran-etal-2022-contrastive
%X Visual Relationship Detection aims to understand real-world objects’ interactions by grounding visual concepts to compositional visual relation triples, written in the form of (subject, predicate, object). Previous works have explored the use of contrastive learning to implicitly predict the predicates from the relevant image regions. However, these models often directly leverage in-distribution spatial and language co-occurrence biases during training, preventing the models from generalizing to out-of-distribution compositions. In this work, we examine whether contrastive vision and language models pre-trained on large-scale external image and text datasets can assist the detection of compositional visual relationships. To this end, we propose a semi-supervised contrastive fine-tuning approach for the visual relationship detection task. The results show that fine-tuned models that were pre-trained on larger datasets do not yield better performance when performing visual relationship detection, and larger models can yield lower performance when compared with their smaller counterparts.
%U https://aclanthology.org/2022.alta-1.23
%P 170-177
Markdown (Informal)
[Contrastive Visual and Language Learning for Visual Relationship Detection](https://aclanthology.org/2022.alta-1.23) (Tran et al., ALTA 2022)
ACL
Thanh Tran, Maelic Neau, Paulo Santos, and David Powers. 2022. Contrastive Visual and Language Learning for Visual Relationship Detection. In Proceedings of the 20th Annual Workshop of the Australasian Language Technology Association, pages 170–177, Adelaide, Australia. Australasian Language Technology Association.