@inproceedings{alikhani-stone-2019-caption,
title = "{``}Caption{''} as a Coherence Relation: Evidence and Implications",
author = "Alikhani, Malihe and
Stone, Matthew",
editor = "Bernardi, Raffaella and
Fernandez, Raquel and
Gella, Spandana and
Kafle, Kushal and
Kanan, Christopher and
Lee, Stefan and
Nabi, Moin",
booktitle = "Proceedings of the Second Workshop on Shortcomings in Vision and Language",
month = jun,
year = "2019",
address = "Minneapolis, Minnesota",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/W19-1806",
doi = "10.18653/v1/W19-1806",
pages = "58--67",
abstract = "We study verbs in image{--}text corpora, contrasting \textit{caption} corpora, where texts are explicitly written to characterize image content, with \textit{depiction} corpora, where texts and images may stand in more general relations. Captions show a distinctively limited distribution of verbs, with strong preferences for specific tense, aspect, lexical aspect, and semantic field. These limitations, which appear in data elicited by a range of methods, restrict the utility of caption corpora to inform image retrieval, multimodal document generation, and perceptually-grounded semantic models. We suggest that these limitations reflect the discourse constraints in play when subjects write texts to accompany imagery, so we argue that future development of image{--}text corpora should work to increase the diversity of event descriptions, while looking explicitly at the different ways text and imagery can be coherently related.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="alikhani-stone-2019-caption">
<titleInfo>
<title>“Caption” as a Coherence Relation: Evidence and Implications</title>
</titleInfo>
<name type="personal">
<namePart type="given">Malihe</namePart>
<namePart type="family">Alikhani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Matthew</namePart>
<namePart type="family">Stone</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2019-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Second Workshop on Shortcomings in Vision and Language</title>
</titleInfo>
<name type="personal">
<namePart type="given">Raffaella</namePart>
<namePart type="family">Bernardi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Raquel</namePart>
<namePart type="family">Fernandez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Spandana</namePart>
<namePart type="family">Gella</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kushal</namePart>
<namePart type="family">Kafle</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Christopher</namePart>
<namePart type="family">Kanan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stefan</namePart>
<namePart type="family">Lee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Moin</namePart>
<namePart type="family">Nabi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Minneapolis, Minnesota</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We study verbs in image–text corpora, contrasting caption corpora, where texts are explicitly written to characterize image content, with depiction corpora, where texts and images may stand in more general relations. Captions show a distinctively limited distribution of verbs, with strong preferences for specific tense, aspect, lexical aspect, and semantic field. These limitations, which appear in data elicited by a range of methods, restrict the utility of caption corpora to inform image retrieval, multimodal document generation, and perceptually-grounded semantic models. We suggest that these limitations reflect the discourse constraints in play when subjects write texts to accompany imagery, so we argue that future development of image–text corpora should work to increase the diversity of event descriptions, while looking explicitly at the different ways text and imagery can be coherently related.</abstract>
<identifier type="citekey">alikhani-stone-2019-caption</identifier>
<identifier type="doi">10.18653/v1/W19-1806</identifier>
<location>
<url>https://aclanthology.org/W19-1806</url>
</location>
<part>
<date>2019-06</date>
<extent unit="page">
<start>58</start>
<end>67</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T “Caption” as a Coherence Relation: Evidence and Implications
%A Alikhani, Malihe
%A Stone, Matthew
%Y Bernardi, Raffaella
%Y Fernandez, Raquel
%Y Gella, Spandana
%Y Kafle, Kushal
%Y Kanan, Christopher
%Y Lee, Stefan
%Y Nabi, Moin
%S Proceedings of the Second Workshop on Shortcomings in Vision and Language
%D 2019
%8 June
%I Association for Computational Linguistics
%C Minneapolis, Minnesota
%F alikhani-stone-2019-caption
%X We study verbs in image–text corpora, contrasting caption corpora, where texts are explicitly written to characterize image content, with depiction corpora, where texts and images may stand in more general relations. Captions show a distinctively limited distribution of verbs, with strong preferences for specific tense, aspect, lexical aspect, and semantic field. These limitations, which appear in data elicited by a range of methods, restrict the utility of caption corpora to inform image retrieval, multimodal document generation, and perceptually-grounded semantic models. We suggest that these limitations reflect the discourse constraints in play when subjects write texts to accompany imagery, so we argue that future development of image–text corpora should work to increase the diversity of event descriptions, while looking explicitly at the different ways text and imagery can be coherently related.
%R 10.18653/v1/W19-1806
%U https://aclanthology.org/W19-1806
%U https://doi.org/10.18653/v1/W19-1806
%P 58-67
Markdown (Informal)
[“Caption” as a Coherence Relation: Evidence and Implications](https://aclanthology.org/W19-1806) (Alikhani & Stone, NAACL 2019)
ACL