@inproceedings{bogolin-etal-2020-hierarchical,
title = "A hierarchical approach to vision-based language generation: from simple sentences to complex natural language",
author = "Bogolin, Simion-Vlad and
Croitoru, Ioana and
Leordeanu, Marius",
editor = "Scott, Donia and
Bel, Nuria and
Zong, Chengqing",
booktitle = "Proceedings of the 28th International Conference on Computational Linguistics",
month = dec,
year = "2020",
address = "Barcelona, Spain (Online)",
publisher = "International Committee on Computational Linguistics",
url = "https://aclanthology.org/2020.coling-main.220",
doi = "10.18653/v1/2020.coling-main.220",
pages = "2436--2447",
abstract = "Automatically describing videos in natural language is an ambitious problem, which could bridge our understanding of vision and language. We propose a hierarchical approach, by first generating video descriptions as sequences of simple sentences, followed at the next level by a more complex and fluent description in natural language. While the simple sentences describe simple actions in the form of (subject, verb, object), the second-level paragraph descriptions, indirectly using information from the first-level description, presents the visual content in a more compact, coherent and semantically rich manner. To this end, we introduce the first video dataset in the literature that is annotated with captions at two levels of linguistic complexity. We perform extensive tests that demonstrate that our hierarchical linguistic representation, from simple to complex language, allows us to train a two-stage network that is able to generate significantly more complex paragraphs than current one-stage approaches.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="bogolin-etal-2020-hierarchical">
<titleInfo>
<title>A hierarchical approach to vision-based language generation: from simple sentences to complex natural language</title>
</titleInfo>
<name type="personal">
<namePart type="given">Simion-Vlad</namePart>
<namePart type="family">Bogolin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ioana</namePart>
<namePart type="family">Croitoru</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marius</namePart>
<namePart type="family">Leordeanu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 28th International Conference on Computational Linguistics</title>
</titleInfo>
<name type="personal">
<namePart type="given">Donia</namePart>
<namePart type="family">Scott</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nuria</namePart>
<namePart type="family">Bel</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chengqing</namePart>
<namePart type="family">Zong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>International Committee on Computational Linguistics</publisher>
<place>
<placeTerm type="text">Barcelona, Spain (Online)</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Automatically describing videos in natural language is an ambitious problem, which could bridge our understanding of vision and language. We propose a hierarchical approach, by first generating video descriptions as sequences of simple sentences, followed at the next level by a more complex and fluent description in natural language. While the simple sentences describe simple actions in the form of (subject, verb, object), the second-level paragraph descriptions, indirectly using information from the first-level description, presents the visual content in a more compact, coherent and semantically rich manner. To this end, we introduce the first video dataset in the literature that is annotated with captions at two levels of linguistic complexity. We perform extensive tests that demonstrate that our hierarchical linguistic representation, from simple to complex language, allows us to train a two-stage network that is able to generate significantly more complex paragraphs than current one-stage approaches.</abstract>
<identifier type="citekey">bogolin-etal-2020-hierarchical</identifier>
<identifier type="doi">10.18653/v1/2020.coling-main.220</identifier>
<location>
<url>https://aclanthology.org/2020.coling-main.220</url>
</location>
<part>
<date>2020-12</date>
<extent unit="page">
<start>2436</start>
<end>2447</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T A hierarchical approach to vision-based language generation: from simple sentences to complex natural language
%A Bogolin, Simion-Vlad
%A Croitoru, Ioana
%A Leordeanu, Marius
%Y Scott, Donia
%Y Bel, Nuria
%Y Zong, Chengqing
%S Proceedings of the 28th International Conference on Computational Linguistics
%D 2020
%8 December
%I International Committee on Computational Linguistics
%C Barcelona, Spain (Online)
%F bogolin-etal-2020-hierarchical
%X Automatically describing videos in natural language is an ambitious problem, which could bridge our understanding of vision and language. We propose a hierarchical approach, by first generating video descriptions as sequences of simple sentences, followed at the next level by a more complex and fluent description in natural language. While the simple sentences describe simple actions in the form of (subject, verb, object), the second-level paragraph descriptions, indirectly using information from the first-level description, presents the visual content in a more compact, coherent and semantically rich manner. To this end, we introduce the first video dataset in the literature that is annotated with captions at two levels of linguistic complexity. We perform extensive tests that demonstrate that our hierarchical linguistic representation, from simple to complex language, allows us to train a two-stage network that is able to generate significantly more complex paragraphs than current one-stage approaches.
%R 10.18653/v1/2020.coling-main.220
%U https://aclanthology.org/2020.coling-main.220
%U https://doi.org/10.18653/v1/2020.coling-main.220
%P 2436-2447
Markdown (Informal)
[A hierarchical approach to vision-based language generation: from simple sentences to complex natural language](https://aclanthology.org/2020.coling-main.220) (Bogolin et al., COLING 2020)
ACL