@article{georges-etal-2024-decode,
title = "Decode, Move and Speak! Self-supervised Learning of Speech Units, Gestures, and Sound Relationships Using Vocal Imitation",
author = "Georges, Marc-Antoine and
Lavechin, Marvin and
Schwartz, Jean-Luc and
Hueber, Thomas",
journal = "Computational Linguistics",
volume = "50",
number = "3",
month = dec,
year = "2024",
address = "Cambridge, MA",
publisher = "MIT Press",
url = "https://aclanthology.org/2024.cl-4.5/",
doi = "10.1162/coli_a_00532",
pages = "1345--1373",
abstract = "Speech learning encompasses mastering a complex motor system to produce speech sounds from articulatory gestures while simultaneously uncovering discrete units that provide entry to the linguistic system. Remarkably, children acquire these associations between speech sounds, articulatory gestures, and linguistic units in a weakly supervised manner, without the need for explicit labeling of auditory inputs or access to target articulatory gestures. This study uses self-supervised deep learning to investigate the respective roles of sounds, gestures, and linguistic units in speech acquisition and control. In a first experiment, we analyzed the quantized representations learned by vector-quantized variational autoencoders (VQ-VAE) from ground truth acoustic and articulatory data using ABX tests. We show an interesting complementarity between acoustic and articulatory modalities that may help in the discovery of phonemes. In a second experiment, we introduce a computational agent that repeats auditory speech inputs by controlling a virtual vocal apparatus. This agent integrates an articulatory synthesizer capable of reproducing diverse speech stimuli from interpretable parameters, along with two internal models implementing the articulatory-to-acoustic (forward) and acoustic-to-articulatory (inverse) mapping, respectively. Additionally, two inductive biases are used to regularize the ill-posed acoustic-to-articulatory inverse mapping. In line with the first experiment, we explore the complementarity between the auditory input and the articulatory parameters inferred by the agent. We also evaluate the impact of discretizing auditory inputs using VQ-VAE. While the majority of the agent's productions are intelligible (according to perceptual evaluations), our analysis highlights inconsistencies in the underlying articulatory trajectories. In particular, we show that the agent's productions only partially reproduce the complementarity between the auditory and articulatory modalities observed in humans."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="georges-etal-2024-decode">
<titleInfo>
<title>Decode, Move and Speak! Self-supervised Learning of Speech Units, Gestures, and Sound Relationships Using Vocal Imitation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Marc-Antoine</namePart>
<namePart type="family">Georges</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marvin</namePart>
<namePart type="family">Lavechin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jean-Luc</namePart>
<namePart type="family">Schwartz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Thomas</namePart>
<namePart type="family">Hueber</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<genre authority="bibutilsgt">journal article</genre>
<relatedItem type="host">
<titleInfo>
<title>Computational Linguistics</title>
</titleInfo>
<originInfo>
<issuance>continuing</issuance>
<publisher>MIT Press</publisher>
<place>
<placeTerm type="text">Cambridge, MA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">periodical</genre>
<genre authority="bibutilsgt">academic journal</genre>
</relatedItem>
<abstract>Speech learning encompasses mastering a complex motor system to produce speech sounds from articulatory gestures while simultaneously uncovering discrete units that provide entry to the linguistic system. Remarkably, children acquire these associations between speech sounds, articulatory gestures, and linguistic units in a weakly supervised manner, without the need for explicit labeling of auditory inputs or access to target articulatory gestures. This study uses self-supervised deep learning to investigate the respective roles of sounds, gestures, and linguistic units in speech acquisition and control. In a first experiment, we analyzed the quantized representations learned by vector-quantized variational autoencoders (VQ-VAE) from ground truth acoustic and articulatory data using ABX tests. We show an interesting complementarity between acoustic and articulatory modalities that may help in the discovery of phonemes. In a second experiment, we introduce a computational agent that repeats auditory speech inputs by controlling a virtual vocal apparatus. This agent integrates an articulatory synthesizer capable of reproducing diverse speech stimuli from interpretable parameters, along with two internal models implementing the articulatory-to-acoustic (forward) and acoustic-to-articulatory (inverse) mapping, respectively. Additionally, two inductive biases are used to regularize the ill-posed acoustic-to-articulatory inverse mapping. In line with the first experiment, we explore the complementarity between the auditory input and the articulatory parameters inferred by the agent. We also evaluate the impact of discretizing auditory inputs using VQ-VAE. While the majority of the agent's productions are intelligible (according to perceptual evaluations), our analysis highlights inconsistencies in the underlying articulatory trajectories. In particular, we show that the agent's productions only partially reproduce the complementarity between the auditory and articulatory modalities observed in humans.</abstract>
<identifier type="citekey">georges-etal-2024-decode</identifier>
<identifier type="doi">10.1162/coli_a_00532</identifier>
<location>
<url>https://aclanthology.org/2024.cl-4.5/</url>
</location>
<part>
<date>2024-12</date>
<detail type="volume"><number>50</number></detail>
<detail type="issue"><number>3</number></detail>
<extent unit="page">
<start>1345</start>
<end>1373</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Journal Article
%T Decode, Move and Speak! Self-supervised Learning of Speech Units, Gestures, and Sound Relationships Using Vocal Imitation
%A Georges, Marc-Antoine
%A Lavechin, Marvin
%A Schwartz, Jean-Luc
%A Hueber, Thomas
%J Computational Linguistics
%D 2024
%8 December
%V 50
%N 3
%I MIT Press
%C Cambridge, MA
%F georges-etal-2024-decode
%X Speech learning encompasses mastering a complex motor system to produce speech sounds from articulatory gestures while simultaneously uncovering discrete units that provide entry to the linguistic system. Remarkably, children acquire these associations between speech sounds, articulatory gestures, and linguistic units in a weakly supervised manner, without the need for explicit labeling of auditory inputs or access to target articulatory gestures. This study uses self-supervised deep learning to investigate the respective roles of sounds, gestures, and linguistic units in speech acquisition and control. In a first experiment, we analyzed the quantized representations learned by vector-quantized variational autoencoders (VQ-VAE) from ground truth acoustic and articulatory data using ABX tests. We show an interesting complementarity between acoustic and articulatory modalities that may help in the discovery of phonemes. In a second experiment, we introduce a computational agent that repeats auditory speech inputs by controlling a virtual vocal apparatus. This agent integrates an articulatory synthesizer capable of reproducing diverse speech stimuli from interpretable parameters, along with two internal models implementing the articulatory-to-acoustic (forward) and acoustic-to-articulatory (inverse) mapping, respectively. Additionally, two inductive biases are used to regularize the ill-posed acoustic-to-articulatory inverse mapping. In line with the first experiment, we explore the complementarity between the auditory input and the articulatory parameters inferred by the agent. We also evaluate the impact of discretizing auditory inputs using VQ-VAE. While the majority of the agent's productions are intelligible (according to perceptual evaluations), our analysis highlights inconsistencies in the underlying articulatory trajectories. In particular, we show that the agent's productions only partially reproduce the complementarity between the auditory and articulatory modalities observed in humans.
%R 10.1162/coli_a_00532
%U https://aclanthology.org/2024.cl-4.5/
%U https://doi.org/10.1162/coli_a_00532
%P 1345-1373
Markdown (Informal)
[Decode, Move and Speak! Self-supervised Learning of Speech Units, Gestures, and Sound Relationships Using Vocal Imitation](https://aclanthology.org/2024.cl-4.5/) (Georges et al., CL 2024)
ACL
Marc-Antoine Georges, Marvin Lavechin, Jean-Luc Schwartz, and Thomas Hueber. 2024. Decode, Move and Speak! Self-supervised Learning of Speech Units, Gestures, and Sound Relationships Using Vocal Imitation. Computational Linguistics, 50(3):1345–1373.