@inproceedings{rodriguez-etal-2024-self,
title = "Self-supervised speech representations display some human-like cross-linguistic perceptual abilities",
author = "Rodriguez, Joselyn and
Sreepada, Kamala and
Famularo, Ruolan Leslie and
Goldwater, Sharon and
Feldman, Naomi",
editor = "Barak, Libby and
Alikhani, Malihe",
booktitle = "Proceedings of the 28th Conference on Computational Natural Language Learning",
month = nov,
year = "2024",
address = "Miami, FL, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.conll-1.35",
pages = "458--463",
abstract = "State of the art models in automatic speech recognition have shown remarkable improvements due to modern self-supervised (SSL) transformer-based architectures such as wav2vec 2.0 (Baevski et al., 2020). However, how these models encode phonetic information is still not well understood. We explore whether SSL speech models display a linguistic property that characterizes human speech perception: language specificity. We show that while wav2vec 2.0 displays an overall language specificity effect when tested on Hindi vs. English, it does not resemble human speech perception when tested on finer-grained differences in Hindi speech contrasts.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="rodriguez-etal-2024-self">
<titleInfo>
<title>Self-supervised speech representations display some human-like cross-linguistic perceptual abilities</title>
</titleInfo>
<name type="personal">
<namePart type="given">Joselyn</namePart>
<namePart type="family">Rodriguez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kamala</namePart>
<namePart type="family">Sreepada</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ruolan</namePart>
<namePart type="given">Leslie</namePart>
<namePart type="family">Famularo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sharon</namePart>
<namePart type="family">Goldwater</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Naomi</namePart>
<namePart type="family">Feldman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 28th Conference on Computational Natural Language Learning</title>
</titleInfo>
<name type="personal">
<namePart type="given">Libby</namePart>
<namePart type="family">Barak</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Malihe</namePart>
<namePart type="family">Alikhani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, FL, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>State of the art models in automatic speech recognition have shown remarkable improvements due to modern self-supervised (SSL) transformer-based architectures such as wav2vec 2.0 (Baevski et al., 2020). However, how these models encode phonetic information is still not well understood. We explore whether SSL speech models display a linguistic property that characterizes human speech perception: language specificity. We show that while wav2vec 2.0 displays an overall language specificity effect when tested on Hindi vs. English, it does not resemble human speech perception when tested on finer-grained differences in Hindi speech contrasts.</abstract>
<identifier type="citekey">rodriguez-etal-2024-self</identifier>
<location>
<url>https://aclanthology.org/2024.conll-1.35</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>458</start>
<end>463</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Self-supervised speech representations display some human-like cross-linguistic perceptual abilities
%A Rodriguez, Joselyn
%A Sreepada, Kamala
%A Famularo, Ruolan Leslie
%A Goldwater, Sharon
%A Feldman, Naomi
%Y Barak, Libby
%Y Alikhani, Malihe
%S Proceedings of the 28th Conference on Computational Natural Language Learning
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, FL, USA
%F rodriguez-etal-2024-self
%X State of the art models in automatic speech recognition have shown remarkable improvements due to modern self-supervised (SSL) transformer-based architectures such as wav2vec 2.0 (Baevski et al., 2020). However, how these models encode phonetic information is still not well understood. We explore whether SSL speech models display a linguistic property that characterizes human speech perception: language specificity. We show that while wav2vec 2.0 displays an overall language specificity effect when tested on Hindi vs. English, it does not resemble human speech perception when tested on finer-grained differences in Hindi speech contrasts.
%U https://aclanthology.org/2024.conll-1.35
%P 458-463
Markdown (Informal)
[Self-supervised speech representations display some human-like cross-linguistic perceptual abilities](https://aclanthology.org/2024.conll-1.35) (Rodriguez et al., CoNLL 2024)
ACL