@inproceedings{yang-etal-2022-self,
title = "Self-supervised Rewiring of Pre-trained Speech Encoders:Towards Faster Fine-tuning with Less Labels in Speech Processing",
author = "Yang, Hao and
Zhao, Jinming and
Haffari, Gholamreza and
Shareghi, Ehsan",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2022",
month = dec,
year = "2022",
address = "Abu Dhabi, United Arab Emirates",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.findings-emnlp.141",
doi = "10.18653/v1/2022.findings-emnlp.141",
pages = "1952--1959",
abstract = "Pre-trained speech Transformers have facilitated great success across various speech processing tasks. However, fine-tuning these encoders for downstream tasks require sufficiently large training data to converge or to achieve state-of-the-art. In text domain this has been partly attributed to sub-optimality of the representation space in pre-trained Transformers. In this work, we take a sober look into pre-trained speech encoders and rewire their representation space without requiring any task-specific labels. Our method utilises neutrally synthesised version of audio inputs along with frame masking to construct positive pairs for contrastive self-supervised learning. When used for augmenting the wav2vec 2 encoder, we observe consistent improvement of isotropy in the representation space. Our experiments on 6 speech processing tasks, exhibit a significant convergence speedup during task fine-tuning as well as consistent task improvement, specially in low-resource settings.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="yang-etal-2022-self">
<titleInfo>
<title>Self-supervised Rewiring of Pre-trained Speech Encoders: Towards Faster Fine-tuning with Less Labels in Speech Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Hao</namePart>
<namePart type="family">Yang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jinming</namePart>
<namePart type="family">Zhao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gholamreza</namePart>
<namePart type="family">Haffari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ehsan</namePart>
<namePart type="family">Shareghi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2022</title>
</titleInfo>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, United Arab Emirates</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Pre-trained speech Transformers have facilitated great success across various speech processing tasks. However, fine-tuning these encoders for downstream tasks requires sufficiently large training data to converge or to achieve state-of-the-art performance. In the text domain, this has been partly attributed to sub-optimality of the representation space in pre-trained Transformers. In this work, we take a sober look into pre-trained speech encoders and rewire their representation space without requiring any task-specific labels. Our method utilises a neutrally synthesised version of the audio inputs, along with frame masking, to construct positive pairs for contrastive self-supervised learning. When used to augment the wav2vec 2 encoder, we observe consistent improvement of isotropy in the representation space. Our experiments on six speech processing tasks exhibit a significant convergence speedup during task fine-tuning as well as consistent task improvement, especially in low-resource settings.</abstract>
<identifier type="citekey">yang-etal-2022-self</identifier>
<identifier type="doi">10.18653/v1/2022.findings-emnlp.141</identifier>
<location>
<url>https://aclanthology.org/2022.findings-emnlp.141</url>
</location>
<part>
<date>2022-12</date>
<extent unit="page">
<start>1952</start>
<end>1959</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Self-supervised Rewiring of Pre-trained Speech Encoders: Towards Faster Fine-tuning with Less Labels in Speech Processing
%A Yang, Hao
%A Zhao, Jinming
%A Haffari, Gholamreza
%A Shareghi, Ehsan
%S Findings of the Association for Computational Linguistics: EMNLP 2022
%D 2022
%8 December
%I Association for Computational Linguistics
%C Abu Dhabi, United Arab Emirates
%F yang-etal-2022-self
%X Pre-trained speech Transformers have facilitated great success across various speech processing tasks. However, fine-tuning these encoders for downstream tasks requires sufficiently large training data to converge or to achieve state-of-the-art performance. In the text domain, this has been partly attributed to sub-optimality of the representation space in pre-trained Transformers. In this work, we take a sober look into pre-trained speech encoders and rewire their representation space without requiring any task-specific labels. Our method utilises a neutrally synthesised version of the audio inputs, along with frame masking, to construct positive pairs for contrastive self-supervised learning. When used to augment the wav2vec 2 encoder, we observe consistent improvement of isotropy in the representation space. Our experiments on six speech processing tasks exhibit a significant convergence speedup during task fine-tuning as well as consistent task improvement, especially in low-resource settings.
%R 10.18653/v1/2022.findings-emnlp.141
%U https://aclanthology.org/2022.findings-emnlp.141
%U https://doi.org/10.18653/v1/2022.findings-emnlp.141
%P 1952-1959
Markdown (Informal)
[Self-supervised Rewiring of Pre-trained Speech Encoders: Towards Faster Fine-tuning with Less Labels in Speech Processing](https://aclanthology.org/2022.findings-emnlp.141) (Yang et al., Findings 2022)
ACL
Hao Yang, Jinming Zhao, Gholamreza Haffari, and Ehsan Shareghi. 2022. [Self-supervised Rewiring of Pre-trained Speech Encoders: Towards Faster Fine-tuning with Less Labels in Speech Processing](https://aclanthology.org/2022.findings-emnlp.141). In *Findings of the Association for Computational Linguistics: EMNLP 2022*, pages 1952–1959, Abu Dhabi, United Arab Emirates. Association for Computational Linguistics.
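The abstract describes the rewiring recipe only at a high level: build a positive pair for each utterance from a neutrally synthesised, frame-masked copy, then pull the two encoder representations together with a contrastive self-supervised loss. As a rough, hedged illustration of that idea (not the authors' released implementation), the sketch below applies a generic NT-Xent contrastive loss to two views of pooled wav2vec 2 outputs; the pooling, temperature, and pairing details here are assumptions.

```python
# Hypothetical sketch of a contrastive "rewiring" objective, assuming two
# pooled views per utterance (original vs. neutrally synthesised + frame-masked).
# This is a generic NT-Xent loss, not the paper's exact implementation.
import torch
import torch.nn.functional as F


def ntxent_loss(view_a: torch.Tensor, view_b: torch.Tensor,
                temperature: float = 0.1) -> torch.Tensor:
    """view_a, view_b: (batch, dim) pooled encoder outputs for the same utterances,
    one from the original audio and one from its augmented copy."""
    a = F.normalize(view_a, dim=-1)
    b = F.normalize(view_b, dim=-1)
    reps = torch.cat([a, b], dim=0)              # (2B, dim)
    sim = reps @ reps.t() / temperature          # scaled cosine similarities
    sim.fill_diagonal_(float("-inf"))            # exclude self-similarity
    batch = a.size(0)
    # Positives: the i-th original pairs with the i-th augmented view, and vice versa.
    targets = torch.cat([torch.arange(batch) + batch,
                         torch.arange(batch)]).to(sim.device)
    return F.cross_entropy(sim, targets)
```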