@inproceedings{gokhale-etal-2022-semantically,
title = "Semantically Distributed Robust Optimization for Vision-and-Language Inference",
author = "Gokhale, Tejas and
Chaudhary, Abhishek and
Banerjee, Pratyay and
Baral, Chitta and
Yang, Yezhou",
editor = "Muresan, Smaranda and
Nakov, Preslav and
Villavicencio, Aline",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2022",
month = may,
year = "2022",
address = "Dublin, Ireland",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.findings-acl.118",
doi = "10.18653/v1/2022.findings-acl.118",
pages = "1493--1513",
abstract = "Analysis of vision-and-language models has revealed their brittleness under linguistic phenomena such as paraphrasing, negation, textual entailment, and word substitutions with synonyms or antonyms. While data augmentation techniques have been designed to mitigate against these failure modes, methods that can integrate this knowledge into the training pipeline remain under-explored. In this paper, we present \textbf{SDRO}, a model-agnostic method that utilizes a set linguistic transformations in a distributed robust optimization setting, along with an ensembling technique to leverage these transformations during inference.Experiments on benchmark datasets with images (NLVR$^2$) and video (VIOLIN) demonstrate performance improvements as well as robustness to adversarial attacks.Experiments on binary VQA explore the generalizability of this method to other V{\&}L tasks.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="gokhale-etal-2022-semantically">
<titleInfo>
<title>Semantically Distributed Robust Optimization for Vision-and-Language Inference</title>
</titleInfo>
<name type="personal">
<namePart type="given">Tejas</namePart>
<namePart type="family">Gokhale</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Abhishek</namePart>
<namePart type="family">Chaudhary</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pratyay</namePart>
<namePart type="family">Banerjee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chitta</namePart>
<namePart type="family">Baral</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yezhou</namePart>
<namePart type="family">Yang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2022</title>
</titleInfo>
<name type="personal">
<namePart type="given">Smaranda</namePart>
<namePart type="family">Muresan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Preslav</namePart>
<namePart type="family">Nakov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aline</namePart>
<namePart type="family">Villavicencio</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Dublin, Ireland</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Analysis of vision-and-language models has revealed their brittleness under linguistic phenomena such as paraphrasing, negation, textual entailment, and word substitutions with synonyms or antonyms. While data augmentation techniques have been designed to mitigate against these failure modes, methods that can integrate this knowledge into the training pipeline remain under-explored. In this paper, we present SDRO, a model-agnostic method that utilizes a set of linguistic transformations in a distributed robust optimization setting, along with an ensembling technique to leverage these transformations during inference. Experiments on benchmark datasets with images (NLVR²) and video (VIOLIN) demonstrate performance improvements as well as robustness to adversarial attacks. Experiments on binary VQA explore the generalizability of this method to other V&amp;L tasks.</abstract>
<identifier type="citekey">gokhale-etal-2022-semantically</identifier>
<identifier type="doi">10.18653/v1/2022.findings-acl.118</identifier>
<location>
<url>https://aclanthology.org/2022.findings-acl.118</url>
</location>
<part>
<date>2022-05</date>
<extent unit="page">
<start>1493</start>
<end>1513</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Semantically Distributed Robust Optimization for Vision-and-Language Inference
%A Gokhale, Tejas
%A Chaudhary, Abhishek
%A Banerjee, Pratyay
%A Baral, Chitta
%A Yang, Yezhou
%Y Muresan, Smaranda
%Y Nakov, Preslav
%Y Villavicencio, Aline
%S Findings of the Association for Computational Linguistics: ACL 2022
%D 2022
%8 May
%I Association for Computational Linguistics
%C Dublin, Ireland
%F gokhale-etal-2022-semantically
%X Analysis of vision-and-language models has revealed their brittleness under linguistic phenomena such as paraphrasing, negation, textual entailment, and word substitutions with synonyms or antonyms. While data augmentation techniques have been designed to mitigate against these failure modes, methods that can integrate this knowledge into the training pipeline remain under-explored. In this paper, we present SDRO, a model-agnostic method that utilizes a set of linguistic transformations in a distributed robust optimization setting, along with an ensembling technique to leverage these transformations during inference. Experiments on benchmark datasets with images (NLVR²) and video (VIOLIN) demonstrate performance improvements as well as robustness to adversarial attacks. Experiments on binary VQA explore the generalizability of this method to other V&L tasks.
%R 10.18653/v1/2022.findings-acl.118
%U https://aclanthology.org/2022.findings-acl.118
%U https://doi.org/10.18653/v1/2022.findings-acl.118
%P 1493-1513
Markdown (Informal)
[Semantically Distributed Robust Optimization for Vision-and-Language Inference](https://aclanthology.org/2022.findings-acl.118) (Gokhale et al., Findings 2022)
ACL
Tejas Gokhale, Abhishek Chaudhary, Pratyay Banerjee, Chitta Baral, and Yezhou Yang. 2022. Semantically Distributed Robust Optimization for Vision-and-Language Inference. In Findings of the Association for Computational Linguistics: ACL 2022, pages 1493–1513, Dublin, Ireland. Association for Computational Linguistics.
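
The abstract describes training with a set of linguistic transformations under a distributionally robust objective, plus an ensembling step over those transformations at inference. The sketch below is only a minimal, hypothetical illustration of that general idea (a worst-group objective in the spirit of group DRO, followed by prediction averaging); the model, the toy transformations, and all names and hyperparameters are stand-ins, not the paper's actual SDRO implementation, which also distinguishes semantics-preserving from semantics-inverting transformations and handles label flips accordingly.

```python
# Illustrative sketch only: a toy "vision-and-language" classifier trained with a
# worst-case loss over linguistic transformations, and transformation-ensembled
# prediction at test time. Everything here is a hypothetical stand-in.
import torch
import torch.nn as nn
import torch.nn.functional as F

# Toy semantics-preserving transformations (the paper uses a richer set, e.g.
# paraphrases and word substitutions; label-flipping transformations would also
# require flipping the target, which is omitted here for simplicity).
TRANSFORMS = [
    lambda s: s,                      # identity
    lambda s: s.lower(),              # toy "paraphrase"
    lambda s: s.replace(".", ""),     # toy punctuation variation
]

class ToyVLModel(nn.Module):
    """Stand-in for a vision-and-language encoder plus binary classifier."""
    def __init__(self, dim=16):
        super().__init__()
        self.head = nn.Linear(dim, 2)

    def forward(self, image_feat, sentence):
        # Hypothetical fusion: hash the sentence into a vector, add image features.
        text_feat = torch.tensor(
            [hash(sentence + str(i)) % 100 / 100.0 for i in range(image_feat.shape[-1])]
        )
        return self.head(image_feat + text_feat)

def robust_step(model, optimizer, image_feat, sentence, label):
    """One training step: minimize the worst loss across transformation groups."""
    losses = []
    for t in TRANSFORMS:
        logits = model(image_feat, t(sentence))
        losses.append(F.cross_entropy(logits.unsqueeze(0), label))
    loss = torch.stack(losses).max()  # worst-group (robust) objective
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()

@torch.no_grad()
def ensemble_predict(model, image_feat, sentence):
    """Average class probabilities over the same transformations at inference."""
    probs = [F.softmax(model(image_feat, t(sentence)), dim=-1) for t in TRANSFORMS]
    return torch.stack(probs).mean(dim=0).argmax().item()

if __name__ == "__main__":
    model = ToyVLModel()
    opt = torch.optim.SGD(model.parameters(), lr=0.1)
    img, label = torch.randn(16), torch.tensor([1])
    for _ in range(5):
        robust_step(model, opt, img, "The dog is on the couch.", label)
    print(ensemble_predict(model, img, "The dog is on the couch."))
```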