BibTeX
@inproceedings{ahuja-etal-2020-gestures,
title = "No Gestures Left Behind: Learning Relationships between Spoken Language and Freeform Gestures",
author = "Ahuja, Chaitanya and
Lee, Dong Won and
Ishii, Ryo and
Morency, Louis-Philippe",
editor = "Cohn, Trevor and
He, Yulan and
Liu, Yang",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2020",
month = nov,
year = "2020",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2020.findings-emnlp.170",
doi = "10.18653/v1/2020.findings-emnlp.170",
pages = "1884--1895",
abstract = "We study relationships between spoken language and co-speech gestures in context of two key challenges. First, distributions of text and gestures are inherently skewed making it important to model the long tail. Second, gesture predictions are made at a subword level, making it important to learn relationships between language and acoustic cues. We introduce AISLe, which combines adversarial learning with importance sampling to strike a balance between precision and coverage. We propose the use of a multimodal multiscale attention block to perform subword alignment without the need of explicit alignment between language and acoustic cues. Finally, to empirically study the importance of language in this task, we extend the dataset proposed in Ahuja et al. (2020) with automatically extracted transcripts for audio signals. We substantiate the effectiveness of our approach through large-scale quantitative and user studies, which show that our proposed methodology significantly outperforms previous state-of-the-art approaches for gesture generation. Link to code, data and videos: \url{https://github.com/chahuja/aisle}",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="ahuja-etal-2020-gestures">
<titleInfo>
<title>No Gestures Left Behind: Learning Relationships between Spoken Language and Freeform Gestures</title>
</titleInfo>
<name type="personal">
<namePart type="given">Chaitanya</namePart>
<namePart type="family">Ahuja</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dong</namePart>
<namePart type="given">Won</namePart>
<namePart type="family">Lee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ryo</namePart>
<namePart type="family">Ishii</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Louis-Philippe</namePart>
<namePart type="family">Morency</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2020</title>
</titleInfo>
<name type="personal">
<namePart type="given">Trevor</namePart>
<namePart type="family">Cohn</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yulan</namePart>
<namePart type="family">He</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yang</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We study relationships between spoken language and co-speech gestures in the context of two key challenges. First, distributions of text and gestures are inherently skewed, making it important to model the long tail. Second, gesture predictions are made at a subword level, making it important to learn relationships between language and acoustic cues. We introduce AISLe, which combines adversarial learning with importance sampling to strike a balance between precision and coverage. We propose the use of a multimodal multiscale attention block to perform subword alignment without the need for explicit alignment between language and acoustic cues. Finally, to empirically study the importance of language in this task, we extend the dataset proposed in Ahuja et al. (2020) with automatically extracted transcripts for audio signals. We substantiate the effectiveness of our approach through large-scale quantitative and user studies, which show that our proposed methodology significantly outperforms previous state-of-the-art approaches for gesture generation. Link to code, data and videos: https://github.com/chahuja/aisle</abstract>
<identifier type="citekey">ahuja-etal-2020-gestures</identifier>
<identifier type="doi">10.18653/v1/2020.findings-emnlp.170</identifier>
<location>
<url>https://aclanthology.org/2020.findings-emnlp.170</url>
</location>
<part>
<date>2020-11</date>
<extent unit="page">
<start>1884</start>
<end>1895</end>
</extent>
</part>
</mods>
</modsCollection>
Endnote
%0 Conference Proceedings
%T No Gestures Left Behind: Learning Relationships between Spoken Language and Freeform Gestures
%A Ahuja, Chaitanya
%A Lee, Dong Won
%A Ishii, Ryo
%A Morency, Louis-Philippe
%Y Cohn, Trevor
%Y He, Yulan
%Y Liu, Yang
%S Findings of the Association for Computational Linguistics: EMNLP 2020
%D 2020
%8 November
%I Association for Computational Linguistics
%C Online
%F ahuja-etal-2020-gestures
%X We study relationships between spoken language and co-speech gestures in the context of two key challenges. First, distributions of text and gestures are inherently skewed, making it important to model the long tail. Second, gesture predictions are made at a subword level, making it important to learn relationships between language and acoustic cues. We introduce AISLe, which combines adversarial learning with importance sampling to strike a balance between precision and coverage. We propose the use of a multimodal multiscale attention block to perform subword alignment without the need for explicit alignment between language and acoustic cues. Finally, to empirically study the importance of language in this task, we extend the dataset proposed in Ahuja et al. (2020) with automatically extracted transcripts for audio signals. We substantiate the effectiveness of our approach through large-scale quantitative and user studies, which show that our proposed methodology significantly outperforms previous state-of-the-art approaches for gesture generation. Link to code, data and videos: https://github.com/chahuja/aisle
%R 10.18653/v1/2020.findings-emnlp.170
%U https://aclanthology.org/2020.findings-emnlp.170
%U https://doi.org/10.18653/v1/2020.findings-emnlp.170
%P 1884-1895
Markdown (Informal)
[No Gestures Left Behind: Learning Relationships between Spoken Language and Freeform Gestures](https://aclanthology.org/2020.findings-emnlp.170) (Ahuja et al., Findings 2020)
ACL
Chaitanya Ahuja, Dong Won Lee, Ryo Ishii, and Louis-Philippe Morency. 2020. No Gestures Left Behind: Learning Relationships between Spoken Language and Freeform Gestures. In Findings of the Association for Computational Linguistics: EMNLP 2020, pages 1884–1895, Online. Association for Computational Linguistics.