% Conference paper in the CLiC-it 2024 proceedings (record URL points to the ACL Anthology).
% The title carries no trailing period: the bibliography style supplies terminal punctuation.
@inproceedings{vitale-etal-2024-modelling,
  title     = {Modelling Filled Particles and Prolongation Using End-to-end Automatic Speech Recognition Systems: A Quantitative and Qualitative Analysis},
  author    = {Vitale, Vincenzo Norman and
               Schettino, Loredana and
               Cutugno, Francesco},
  editor    = {Dell'Orletta, Felice and
               Lenci, Alessandro and
               Montemagni, Simonetta and
               Sprugnoli, Rachele},
  booktitle = {Proceedings of the 10th Italian Conference on Computational Linguistics (CLiC-it 2024)},
  month     = dec,
  year      = {2024},
  address   = {Pisa, Italy},
  publisher = {CEUR Workshop Proceedings},
  url       = {https://aclanthology.org/2024.clicit-1.107/},
  pages     = {990--996},
  isbn      = {979-12-210-7060-6},
  abstract  = {State-of-the-art automatic speech recognition systems based on End-to-End models (E2E-ASRs) achieve remarkable performances. However, phenomena that characterize spoken language such as fillers (eeh ehm) or segmental prolongations (theee) are still mostly considered as disrupting objects that should not be included to obtain optimal transcriptions, despite their acknowledged regularity and communicative value. A recent study showed that two types of pre-trained systems with the same Conformer-based encoding architecture but different decoders {--} a Connectionist Temporal Classification (CTC) decoder and a Transducer decoder {--} tend to model some speech features that are functional for the identification of filled pauses and prolongation in speech. This work builds upon these findings by investigating which of the two systems is better at fillers and prolongations detection tasks and by conducting an error analysis to deepen our understanding of how these systems work.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="vitale-etal-2024-modelling">
<titleInfo>
<title>Modelling Filled Particles and Prolongation Using End-to-end Automatic Speech Recognition Systems: A Quantitative and Qualitative Analysis.</title>
</titleInfo>
<name type="personal">
<namePart type="given">Vincenzo</namePart>
<namePart type="given">Norman</namePart>
<namePart type="family">Vitale</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Loredana</namePart>
<namePart type="family">Schettino</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Francesco</namePart>
<namePart type="family">Cutugno</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 10th Italian Conference on Computational Linguistics (CLiC-it 2024)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Felice</namePart>
<namePart type="family">Dell’Orletta</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alessandro</namePart>
<namePart type="family">Lenci</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Simonetta</namePart>
<namePart type="family">Montemagni</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rachele</namePart>
<namePart type="family">Sprugnoli</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>CEUR Workshop Proceedings</publisher>
<place>
<placeTerm type="text">Pisa, Italy</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-12-210-7060-6</identifier>
</relatedItem>
<abstract>State-of-the-art automatic speech recognition systems based on End-to-End models (E2E-ASRs) achieve remarkable performances. However, phenomena that characterize spoken language such as fillers (eeh ehm) or segmental prolongations (theee) are still mostly considered as disrupting objects that should not be included to obtain optimal transcriptions, despite their acknowledged regularity and communicative value. A recent study showed that two types of pre-trained systems with the same Conformer-based encoding architecture but different decoders – a Connectionist Temporal Classification (CTC) decoder and a Transducer decoder – tend to model some speech features that are functional for the identification of filled pauses and prolongation in speech. This work builds upon these findings by investigating which of the two systems is better at fillers and prolongations detection tasks and by conducting an error analysis to deepen our understanding of how these systems work.</abstract>
<identifier type="citekey">vitale-etal-2024-modelling</identifier>
<location>
<url>https://aclanthology.org/2024.clicit-1.107/</url>
</location>
<part>
<date>2024-12</date>
<extent unit="page">
<start>990</start>
<end>996</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Modelling Filled Particles and Prolongation Using End-to-end Automatic Speech Recognition Systems: A Quantitative and Qualitative Analysis.
%A Vitale, Vincenzo Norman
%A Schettino, Loredana
%A Cutugno, Francesco
%Y Dell’Orletta, Felice
%Y Lenci, Alessandro
%Y Montemagni, Simonetta
%Y Sprugnoli, Rachele
%S Proceedings of the 10th Italian Conference on Computational Linguistics (CLiC-it 2024)
%D 2024
%8 December
%I CEUR Workshop Proceedings
%C Pisa, Italy
%@ 979-12-210-7060-6
%F vitale-etal-2024-modelling
%X State-of-the-art automatic speech recognition systems based on End-to-End models (E2E-ASRs) achieve remarkable performances. However, phenomena that characterize spoken language such as fillers (eeh ehm) or segmental prolongations (theee) are still mostly considered as disrupting objects that should not be included to obtain optimal transcriptions, despite their acknowledged regularity and communicative value. A recent study showed that two types of pre-trained systems with the same Conformer-based encoding architecture but different decoders – a Connectionist Temporal Classification (CTC) decoder and a Transducer decoder – tend to model some speech features that are functional for the identification of filled pauses and prolongation in speech. This work builds upon these findings by investigating which of the two systems is better at fillers and prolongations detection tasks and by conducting an error analysis to deepen our understanding of how these systems work.
%U https://aclanthology.org/2024.clicit-1.107/
%P 990-996
Markdown (Informal)
[Modelling Filled Particles and Prolongation Using End-to-end Automatic Speech Recognition Systems: A Quantitative and Qualitative Analysis.](https://aclanthology.org/2024.clicit-1.107/) (Vitale et al., CLiC-it 2024)
ACL