@inproceedings{hessel-etal-2019-case,
title = "A Case Study on Combining {ASR} and Visual Features for Generating Instructional Video Captions",
author = "Hessel, Jack and
Pang, Bo and
Zhu, Zhenhai and
Soricut, Radu",
editor = "Bansal, Mohit and
Villavicencio, Aline",
booktitle = "Proceedings of the 23rd Conference on Computational Natural Language Learning (CoNLL)",
month = nov,
year = "2019",
address = "Hong Kong, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/K19-1039/",
doi = "10.18653/v1/K19-1039",
pages = "419--429",
abstract = "Instructional videos get high-traffic on video sharing platforms, and prior work suggests that providing time-stamped, subtask annotations (e.g., {\textquotedblleft}heat the oil in the pan{\textquotedblright}) improves user experiences. However, current automatic annotation methods based on visual features alone perform only slightly better than constant prediction. Taking cues from prior work, we show that we can improve performance significantly by considering automatic speech recognition (ASR) tokens as input. Furthermore, jointly modeling ASR tokens and visual features results in higher performance compared to training individually on either modality. We find that unstated background information is better explained by visual features, whereas fine-grained distinctions (e.g., {\textquotedblleft}add oil{\textquotedblright} vs. {\textquotedblleft}add olive oil{\textquotedblright}) are disambiguated more easily via ASR tokens."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="hessel-etal-2019-case">
<titleInfo>
<title>A Case Study on Combining ASR and Visual Features for Generating Instructional Video Captions</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jack</namePart>
<namePart type="family">Hessel</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bo</namePart>
<namePart type="family">Pang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhenhai</namePart>
<namePart type="family">Zhu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Radu</namePart>
<namePart type="family">Soricut</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2019-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 23rd Conference on Computational Natural Language Learning (CoNLL)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Mohit</namePart>
<namePart type="family">Bansal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aline</namePart>
<namePart type="family">Villavicencio</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Hong Kong, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Instructional videos get high-traffic on video sharing platforms, and prior work suggests that providing time-stamped, subtask annotations (e.g., “heat the oil in the pan”) improves user experiences. However, current automatic annotation methods based on visual features alone perform only slightly better than constant prediction. Taking cues from prior work, we show that we can improve performance significantly by considering automatic speech recognition (ASR) tokens as input. Furthermore, jointly modeling ASR tokens and visual features results in higher performance compared to training individually on either modality. We find that unstated background information is better explained by visual features, whereas fine-grained distinctions (e.g., “add oil” vs. “add olive oil”) are disambiguated more easily via ASR tokens.</abstract>
<identifier type="citekey">hessel-etal-2019-case</identifier>
<identifier type="doi">10.18653/v1/K19-1039</identifier>
<location>
<url>https://aclanthology.org/K19-1039/</url>
</location>
<part>
<date>2019-11</date>
<extent unit="page">
<start>419</start>
<end>429</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T A Case Study on Combining ASR and Visual Features for Generating Instructional Video Captions
%A Hessel, Jack
%A Pang, Bo
%A Zhu, Zhenhai
%A Soricut, Radu
%Y Bansal, Mohit
%Y Villavicencio, Aline
%S Proceedings of the 23rd Conference on Computational Natural Language Learning (CoNLL)
%D 2019
%8 November
%I Association for Computational Linguistics
%C Hong Kong, China
%F hessel-etal-2019-case
%X Instructional videos get high-traffic on video sharing platforms, and prior work suggests that providing time-stamped, subtask annotations (e.g., “heat the oil in the pan”) improves user experiences. However, current automatic annotation methods based on visual features alone perform only slightly better than constant prediction. Taking cues from prior work, we show that we can improve performance significantly by considering automatic speech recognition (ASR) tokens as input. Furthermore, jointly modeling ASR tokens and visual features results in higher performance compared to training individually on either modality. We find that unstated background information is better explained by visual features, whereas fine-grained distinctions (e.g., “add oil” vs. “add olive oil”) are disambiguated more easily via ASR tokens.
%R 10.18653/v1/K19-1039
%U https://aclanthology.org/K19-1039/
%U https://doi.org/10.18653/v1/K19-1039
%P 419-429
Markdown (Informal)
[A Case Study on Combining ASR and Visual Features for Generating Instructional Video Captions](https://aclanthology.org/K19-1039/) (Hessel et al., CoNLL 2019)
ACL