@inproceedings{han-zarriess-2019-sketch,
title = "Sketch Me if You Can: Towards Generating Detailed Descriptions of Object Shape by Grounding in Images and Drawings",
author = "Han, Ting and
Zarrie{\ss}, Sina",
editor = "van Deemter, Kees and
Lin, Chenghua and
Takamura, Hiroya",
booktitle = "Proceedings of the 12th International Conference on Natural Language Generation",
month = oct # "{--}" # nov,
year = "2019",
address = "Tokyo, Japan",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/W19-8618",
doi = "10.18653/v1/W19-8618",
pages = "136--140",
abstract = "A lot of recent work in Language {\&} Vision has looked at generating descriptions or referring expressions for objects in scenes of real-world images, though focusing mostly on relatively simple language like object names, color and location attributes (e.g., brown chair on the left). This paper presents work on Draw-and-Tell, a dataset of detailed descriptions for common objects in images where annotators have produced fine-grained attribute-centric expressions distinguishing a target object from a range of similar objects. Additionally, the dataset comes with hand-drawn sketches for each object. As Draw-and-Tell is medium-sized and contains a rich vocabulary, it constitutes an interesting challenge for CNN-LSTM architectures used in state-of-the-art image captioning models. We explore whether the additional modality given through sketches can help such a model to learn to accurately ground detailed language referring expressions to object shapes. Our results are encouraging.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="han-zarriess-2019-sketch">
<titleInfo>
<title>Sketch Me if You Can: Towards Generating Detailed Descriptions of Object Shape by Grounding in Images and Drawings</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ting</namePart>
<namePart type="family">Han</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sina</namePart>
<namePart type="family">Zarrieß</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2019-oct–nov</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 12th International Conference on Natural Language Generation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kees</namePart>
<namePart type="family">van Deemter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chenghua</namePart>
<namePart type="family">Lin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hiroya</namePart>
<namePart type="family">Takamura</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Tokyo, Japan</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>A lot of recent work in Language &amp; Vision has looked at generating descriptions or referring expressions for objects in scenes of real-world images, though focusing mostly on relatively simple language like object names, color and location attributes (e.g., brown chair on the left). This paper presents work on Draw-and-Tell, a dataset of detailed descriptions for common objects in images where annotators have produced fine-grained attribute-centric expressions distinguishing a target object from a range of similar objects. Additionally, the dataset comes with hand-drawn sketches for each object. As Draw-and-Tell is medium-sized and contains a rich vocabulary, it constitutes an interesting challenge for CNN-LSTM architectures used in state-of-the-art image captioning models. We explore whether the additional modality given through sketches can help such a model to learn to accurately ground detailed language referring expressions to object shapes. Our results are encouraging.</abstract>
<identifier type="citekey">han-zarriess-2019-sketch</identifier>
<identifier type="doi">10.18653/v1/W19-8618</identifier>
<location>
<url>https://aclanthology.org/W19-8618</url>
</location>
<part>
<date>2019-oct–nov</date>
<extent unit="page">
<start>136</start>
<end>140</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Sketch Me if You Can: Towards Generating Detailed Descriptions of Object Shape by Grounding in Images and Drawings
%A Han, Ting
%A Zarrieß, Sina
%Y van Deemter, Kees
%Y Lin, Chenghua
%Y Takamura, Hiroya
%S Proceedings of the 12th International Conference on Natural Language Generation
%D 2019
%8 oct–nov
%I Association for Computational Linguistics
%C Tokyo, Japan
%F han-zarriess-2019-sketch
%X A lot of recent work in Language & Vision has looked at generating descriptions or referring expressions for objects in scenes of real-world images, though focusing mostly on relatively simple language like object names, color and location attributes (e.g., brown chair on the left). This paper presents work on Draw-and-Tell, a dataset of detailed descriptions for common objects in images where annotators have produced fine-grained attribute-centric expressions distinguishing a target object from a range of similar objects. Additionally, the dataset comes with hand-drawn sketches for each object. As Draw-and-Tell is medium-sized and contains a rich vocabulary, it constitutes an interesting challenge for CNN-LSTM architectures used in state-of-the-art image captioning models. We explore whether the additional modality given through sketches can help such a model to learn to accurately ground detailed language referring expressions to object shapes. Our results are encouraging.
%R 10.18653/v1/W19-8618
%U https://aclanthology.org/W19-8618
%U https://doi.org/10.18653/v1/W19-8618
%P 136-140
Markdown (Informal)
[Sketch Me if You Can: Towards Generating Detailed Descriptions of Object Shape by Grounding in Images and Drawings](https://aclanthology.org/W19-8618) (Han & Zarrieß, INLG 2019)