@article{pouw-etal-2024-perception,
title = "Perception of Phonological Assimilation by Neural Speech Recognition Models",
author = "Pouw, Charlotte and
Kloots, Marianne de Heer and
Alishahi, Afra and
Zuidema, Willem",
journal = "Computational Linguistics",
volume = "50",
number = "3",
month = dec,
year = "2024",
address = "Cambridge, MA",
publisher = "MIT Press",
url = "https://aclanthology.org/2024.cl-4.11/",
doi = "10.1162/coli_a_00526",
pages = "1557--1585",
abstract = "Human listeners effortlessly compensate for phonological changes during speech perception, often unconsciously inferring the intended sounds. For example, listeners infer the underlying /n/ when hearing an utterance such as {\textquotedblleft}clea[m] pan{\textquotedblright}, where [m] arises from place assimilation to the following labial [p]. This article explores how the neural speech recognition model Wav2Vec2 perceives assimilated sounds, and identifies the linguistic knowledge that is implemented by the model to compensate for assimilation during Automatic Speech Recognition (ASR). Using psycholinguistic stimuli, we systematically analyze how various linguistic context cues influence compensation patterns in the model`s output. Complementing these behavioral experiments, our probing experiments indicate that the model shifts its interpretation of assimilated sounds from their acoustic form to their underlying form in its final layers. Finally, our causal intervention experiments suggest that the model relies on minimal phonological context cues to accomplish this shift. These findings represent a step towards better understanding the similarities and differences in phonological processing between neural ASR models and humans."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="pouw-etal-2024-perception">
<titleInfo>
<title>Perception of Phonological Assimilation by Neural Speech Recognition Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Charlotte</namePart>
<namePart type="family">Pouw</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marianne</namePart>
<namePart type="given">de</namePart>
<namePart type="given">Heer</namePart>
<namePart type="family">Kloots</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Afra</namePart>
<namePart type="family">Alishahi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Willem</namePart>
<namePart type="family">Zuidema</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<genre authority="bibutilsgt">journal article</genre>
<relatedItem type="host">
<titleInfo>
<title>Computational Linguistics</title>
</titleInfo>
<originInfo>
<issuance>continuing</issuance>
<publisher>MIT Press</publisher>
<place>
<placeTerm type="text">Cambridge, MA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">periodical</genre>
<genre authority="bibutilsgt">academic journal</genre>
</relatedItem>
<abstract>Human listeners effortlessly compensate for phonological changes during speech perception, often unconsciously inferring the intended sounds. For example, listeners infer the underlying /n/ when hearing an utterance such as “clea[m] pan”, where [m] arises from place assimilation to the following labial [p]. This article explores how the neural speech recognition model Wav2Vec2 perceives assimilated sounds, and identifies the linguistic knowledge that is implemented by the model to compensate for assimilation during Automatic Speech Recognition (ASR). Using psycholinguistic stimuli, we systematically analyze how various linguistic context cues influence compensation patterns in the model‘s output. Complementing these behavioral experiments, our probing experiments indicate that the model shifts its interpretation of assimilated sounds from their acoustic form to their underlying form in its final layers. Finally, our causal intervention experiments suggest that the model relies on minimal phonological context cues to accomplish this shift. These findings represent a step towards better understanding the similarities and differences in phonological processing between neural ASR models and humans.</abstract>
<identifier type="citekey">pouw-etal-2024-perception</identifier>
<identifier type="doi">10.1162/coli_a_00526</identifier>
<location>
<url>https://aclanthology.org/2024.cl-4.11/</url>
</location>
<part>
<date>2024-12</date>
<detail type="volume"><number>50</number></detail>
<detail type="issue"><number>3</number></detail>
<extent unit="page">
<start>1557</start>
<end>1585</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Journal Article
%T Perception of Phonological Assimilation by Neural Speech Recognition Models
%A Pouw, Charlotte
%A Kloots, Marianne de Heer
%A Alishahi, Afra
%A Zuidema, Willem
%J Computational Linguistics
%D 2024
%8 December
%V 50
%N 3
%I MIT Press
%C Cambridge, MA
%F pouw-etal-2024-perception
%X Human listeners effortlessly compensate for phonological changes during speech perception, often unconsciously inferring the intended sounds. For example, listeners infer the underlying /n/ when hearing an utterance such as “clea[m] pan”, where [m] arises from place assimilation to the following labial [p]. This article explores how the neural speech recognition model Wav2Vec2 perceives assimilated sounds, and identifies the linguistic knowledge that is implemented by the model to compensate for assimilation during Automatic Speech Recognition (ASR). Using psycholinguistic stimuli, we systematically analyze how various linguistic context cues influence compensation patterns in the model’s output. Complementing these behavioral experiments, our probing experiments indicate that the model shifts its interpretation of assimilated sounds from their acoustic form to their underlying form in its final layers. Finally, our causal intervention experiments suggest that the model relies on minimal phonological context cues to accomplish this shift. These findings represent a step towards better understanding the similarities and differences in phonological processing between neural ASR models and humans.
%R 10.1162/coli_a_00526
%U https://aclanthology.org/2024.cl-4.11/
%U https://doi.org/10.1162/coli_a_00526
%P 1557-1585
Markdown (Informal)
[Perception of Phonological Assimilation by Neural Speech Recognition Models](https://aclanthology.org/2024.cl-4.11/) (Pouw et al., CL 2024)
ACL
Charlotte Pouw, Marianne de Heer Kloots, Afra Alishahi, and Willem Zuidema. 2024. Perception of Phonological Assimilation by Neural Speech Recognition Models. Computational Linguistics, 50(3):1557–1585.