@inproceedings{cambara-etal-2022-recycle,
title = "Recycle Your {W}av2{V}ec2 Codebook: A Speech Perceiver for Keyword Spotting",
author = "C{\'a}mbara, Guillermo and
Luque, Jordi and
Farr{\'u}s, Mireia",
editor = "Calzolari, Nicoletta and
Huang, Chu-Ren and
Kim, Hansaem and
Pustejovsky, James and
Wanner, Leo and
Choi, Key-Sun and
Ryu, Pum-Mo and
Chen, Hsin-Hsi and
Donatelli, Lucia and
Ji, Heng and
Kurohashi, Sadao and
Paggio, Patrizia and
Xue, Nianwen and
Kim, Seokhwan and
Hahm, Younggyun and
He, Zhong and
Lee, Tony Kyungil and
Santus, Enrico and
Bond, Francis and
Na, Seung-Hoon",
booktitle = "Proceedings of the 29th International Conference on Computational Linguistics",
month = oct,
year = "2022",
address = "Gyeongju, Republic of Korea",
publisher = "International Committee on Computational Linguistics",
url = "https://aclanthology.org/2022.coling-1.626",
pages = "7166--7170",
abstract = "Speech information in a pretrained wav2vec2.0 model is usually leveraged through its encoder, which has at least 95M parameters, being not so suitable for small footprint Keyword Spotting. In this work, we show an efficient way of profiting from wav2vec2.0{'}s linguistic knowledge, by recycling the phonetic information encoded in its latent codebook, which has been typically thrown away after pretraining. We do so by transferring the codebook as weights for the latent bottleneck of a Keyword Spotting Perceiver, thus initializing such model with phonetic embeddings already. The Perceiver design relies on cross-attention between these embeddings and input data to generate better representations. Our method delivers accuracy gains compared to random initialization, at no latency costs. Plus, we show that the phonetic embeddings can easily be downsampled with k-means clustering, speeding up inference in 3.5 times at only slight accuracy penalties.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="cambara-etal-2022-recycle">
<titleInfo>
<title>Recycle Your Wav2Vec2 Codebook: A Speech Perceiver for Keyword Spotting</title>
</titleInfo>
<name type="personal">
<namePart type="given">Guillermo</namePart>
<namePart type="family">Cámbara</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jordi</namePart>
<namePart type="family">Luque</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mireia</namePart>
<namePart type="family">Farrús</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-10</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 29th International Conference on Computational Linguistics</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chu-Ren</namePart>
<namePart type="family">Huang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hansaem</namePart>
<namePart type="family">Kim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">James</namePart>
<namePart type="family">Pustejovsky</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Leo</namePart>
<namePart type="family">Wanner</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Key-Sun</namePart>
<namePart type="family">Choi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pum-Mo</namePart>
<namePart type="family">Ryu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hsin-Hsi</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lucia</namePart>
<namePart type="family">Donatelli</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Heng</namePart>
<namePart type="family">Ji</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sadao</namePart>
<namePart type="family">Kurohashi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Patrizia</namePart>
<namePart type="family">Paggio</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nianwen</namePart>
<namePart type="family">Xue</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Seokhwan</namePart>
<namePart type="family">Kim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Younggyun</namePart>
<namePart type="family">Hahm</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhong</namePart>
<namePart type="family">He</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tony</namePart>
<namePart type="given">Kyungil</namePart>
<namePart type="family">Lee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Enrico</namePart>
<namePart type="family">Santus</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Francis</namePart>
<namePart type="family">Bond</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Seung-Hoon</namePart>
<namePart type="family">Na</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>International Committee on Computational Linguistics</publisher>
<place>
<placeTerm type="text">Gyeongju, Republic of Korea</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Speech information in a pretrained wav2vec2.0 model is usually leveraged through its encoder, which has at least 95M parameters, being not so suitable for small footprint Keyword Spotting. In this work, we show an efficient way of profiting from wav2vec2.0’s linguistic knowledge, by recycling the phonetic information encoded in its latent codebook, which has been typically thrown away after pretraining. We do so by transferring the codebook as weights for the latent bottleneck of a Keyword Spotting Perceiver, thus initializing such model with phonetic embeddings already. The Perceiver design relies on cross-attention between these embeddings and input data to generate better representations. Our method delivers accuracy gains compared to random initialization, at no latency costs. Plus, we show that the phonetic embeddings can easily be downsampled with k-means clustering, speeding up inference in 3.5 times at only slight accuracy penalties.</abstract>
<identifier type="citekey">cambara-etal-2022-recycle</identifier>
<location>
<url>https://aclanthology.org/2022.coling-1.626</url>
</location>
<part>
<date>2022-10</date>
<extent unit="page">
<start>7166</start>
<end>7170</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Recycle Your Wav2Vec2 Codebook: A Speech Perceiver for Keyword Spotting
%A Cámbara, Guillermo
%A Luque, Jordi
%A Farrús, Mireia
%Y Calzolari, Nicoletta
%Y Huang, Chu-Ren
%Y Kim, Hansaem
%Y Pustejovsky, James
%Y Wanner, Leo
%Y Choi, Key-Sun
%Y Ryu, Pum-Mo
%Y Chen, Hsin-Hsi
%Y Donatelli, Lucia
%Y Ji, Heng
%Y Kurohashi, Sadao
%Y Paggio, Patrizia
%Y Xue, Nianwen
%Y Kim, Seokhwan
%Y Hahm, Younggyun
%Y He, Zhong
%Y Lee, Tony Kyungil
%Y Santus, Enrico
%Y Bond, Francis
%Y Na, Seung-Hoon
%S Proceedings of the 29th International Conference on Computational Linguistics
%D 2022
%8 October
%I International Committee on Computational Linguistics
%C Gyeongju, Republic of Korea
%F cambara-etal-2022-recycle
%X Speech information in a pretrained wav2vec2.0 model is usually leveraged through its encoder, which has at least 95M parameters, being not so suitable for small footprint Keyword Spotting. In this work, we show an efficient way of profiting from wav2vec2.0’s linguistic knowledge, by recycling the phonetic information encoded in its latent codebook, which has been typically thrown away after pretraining. We do so by transferring the codebook as weights for the latent bottleneck of a Keyword Spotting Perceiver, thus initializing such model with phonetic embeddings already. The Perceiver design relies on cross-attention between these embeddings and input data to generate better representations. Our method delivers accuracy gains compared to random initialization, at no latency costs. Plus, we show that the phonetic embeddings can easily be downsampled with k-means clustering, speeding up inference in 3.5 times at only slight accuracy penalties.
%U https://aclanthology.org/2022.coling-1.626
%P 7166-7170
Markdown (Informal)
[Recycle Your Wav2Vec2 Codebook: A Speech Perceiver for Keyword Spotting](https://aclanthology.org/2022.coling-1.626) (Cámbara et al., COLING 2022)
ACL