% ACL Anthology record: CMCL 2022 workshop paper by Takmaz, Pezzelle and Fernandez.
@inproceedings{takmaz-etal-2022-less,
  author    = {Takmaz, Ece and
               Pezzelle, Sandro and
               Fern{\'a}ndez, Raquel},
  title     = {Less Descriptive yet Discriminative: Quantifying the Properties of Multimodal Referring Utterances via {CLIP}},
  editor    = {Chersoni, Emmanuele and
               Hollenstein, Nora and
               Jacobs, Cassandra and
               Oseki, Yohei and
               Pr{\'e}vot, Laurent and
               Santus, Enrico},
  booktitle = {Proceedings of the Workshop on Cognitive Modeling and Computational Linguistics},
  month     = may,
  year      = {2022},
  address   = {Dublin, Ireland},
  publisher = {Association for Computational Linguistics},
  pages     = {36--42},
  doi       = {10.18653/v1/2022.cmcl-1.4},
  url       = {https://aclanthology.org/2022.cmcl-1.4},
  abstract  = {In this work, we use a transformer-based pre-trained multimodal model, CLIP, to shed light on the mechanisms employed by human speakers when referring to visual entities. In particular, we use CLIP to quantify the degree of descriptiveness (how well an utterance describes an image in isolation) and discriminativeness (to what extent an utterance is effective in picking out a single image among similar images) of human referring utterances within multimodal dialogues. Overall, our results show that utterances become less descriptive over time while their discriminativeness remains unchanged. Through analysis, we propose that this trend could be due to participants relying on the previous mentions in the dialogue history, as well as being able to distill the most discriminative information from the visual context. In general, our study opens up the possibility of using this and similar models to quantify patterns in human data and shed light on the underlying cognitive mechanisms.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 export of the citation record with citekey takmaz-etal-2022-less. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="takmaz-etal-2022-less">
    <titleInfo>
      <title>Less Descriptive yet Discriminative: Quantifying the Properties of Multimodal Referring Utterances via CLIP</title>
    </titleInfo>
    <!-- Paper authors, in byline order. -->
    <name type="personal">
      <namePart type="given">Ece</namePart>
      <namePart type="family">Takmaz</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Sandro</namePart>
      <namePart type="family">Pezzelle</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Raquel</namePart>
      <namePart type="family">Fernández</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2022-05</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, with its editors and publisher details. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the Workshop on Cognitive Modeling and Computational Linguistics</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Emmanuele</namePart>
        <namePart type="family">Chersoni</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Nora</namePart>
        <namePart type="family">Hollenstein</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Cassandra</namePart>
        <namePart type="family">Jacobs</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Yohei</namePart>
        <namePart type="family">Oseki</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Laurent</namePart>
        <namePart type="family">Prévot</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Enrico</namePart>
        <namePart type="family">Santus</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Dublin, Ireland</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>In this work, we use a transformer-based pre-trained multimodal model, CLIP, to shed light on the mechanisms employed by human speakers when referring to visual entities. In particular, we use CLIP to quantify the degree of descriptiveness (how well an utterance describes an image in isolation) and discriminativeness (to what extent an utterance is effective in picking out a single image among similar images) of human referring utterances within multimodal dialogues. Overall, our results show that utterances become less descriptive over time while their discriminativeness remains unchanged. Through analysis, we propose that this trend could be due to participants relying on the previous mentions in the dialogue history, as well as being able to distill the most discriminative information from the visual context. In general, our study opens up the possibility of using this and similar models to quantify patterns in human data and shed light on the underlying cognitive mechanisms.</abstract>
    <!-- Identifiers and landing page for the paper. -->
    <identifier type="citekey">takmaz-etal-2022-less</identifier>
    <identifier type="doi">10.18653/v1/2022.cmcl-1.4</identifier>
    <location>
      <url>https://aclanthology.org/2022.cmcl-1.4</url>
    </location>
    <!-- Page range of the paper within the host volume. -->
    <part>
      <date>2022-05</date>
      <extent unit="page">
        <start>36</start>
        <end>42</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Less Descriptive yet Discriminative: Quantifying the Properties of Multimodal Referring Utterances via CLIP
%A Takmaz, Ece
%A Pezzelle, Sandro
%A Fernández, Raquel
%Y Chersoni, Emmanuele
%Y Hollenstein, Nora
%Y Jacobs, Cassandra
%Y Oseki, Yohei
%Y Prévot, Laurent
%Y Santus, Enrico
%S Proceedings of the Workshop on Cognitive Modeling and Computational Linguistics
%D 2022
%8 May
%I Association for Computational Linguistics
%C Dublin, Ireland
%F takmaz-etal-2022-less
%X In this work, we use a transformer-based pre-trained multimodal model, CLIP, to shed light on the mechanisms employed by human speakers when referring to visual entities. In particular, we use CLIP to quantify the degree of descriptiveness (how well an utterance describes an image in isolation) and discriminativeness (to what extent an utterance is effective in picking out a single image among similar images) of human referring utterances within multimodal dialogues. Overall, our results show that utterances become less descriptive over time while their discriminativeness remains unchanged. Through analysis, we propose that this trend could be due to participants relying on the previous mentions in the dialogue history, as well as being able to distill the most discriminative information from the visual context. In general, our study opens up the possibility of using this and similar models to quantify patterns in human data and shed light on the underlying cognitive mechanisms.
%R 10.18653/v1/2022.cmcl-1.4
%U https://aclanthology.org/2022.cmcl-1.4
%U https://doi.org/10.18653/v1/2022.cmcl-1.4
%P 36-42
Markdown (Informal)
[Less Descriptive yet Discriminative: Quantifying the Properties of Multimodal Referring Utterances via CLIP](https://aclanthology.org/2022.cmcl-1.4) (Takmaz et al., CMCL 2022)
ACL