@inproceedings{karan-etal-2023-1,
title = "1-step Speech Understanding and Transcription Using {CTC} Loss",
author = "Karan, Singla and
Shahab, Jalalv and
Yeon-Jun, Kim and
Andrej, Ljolje and
Antonio Moreno, Daniel and
Srinivas, Bangalore and
Benjamin, Stern",
editor = "Jyoti, D. Pawar and
Sobha, Lalitha Devi",
booktitle = "Proceedings of the 20th International Conference on Natural Language Processing (ICON)",
month = dec,
year = "2023",
address = "Goa University, Goa, India",
publisher = "NLP Association of India (NLPAI)",
url = "https://aclanthology.org/2023.icon-1.29",
pages = "370--377",
abstract = "Recent studies have made some progress in refining end-to-end (E2E) speech recognition encoders by applying Connectionist Temporal Classification (CTC) loss to enhance named entity recognition within transcriptions. However, these methods have been constrained by their exclusive use of the ASCII character set, allowing only a limited array of semantic labels. We propose 1SPU, a 1-step Speech Processing Unit which can recognize speech events (e.g: speaker change) or an NL event (Intent, Emotion) while also transcribing vocal content. It extends the E2E automatic speech recognition (ASR) system{'}s vocabulary by adding a set of unused placeholder symbols, conceptually akin to the {\textless}pad{\textgreater} tokens used in sequence modeling. These placeholders are then assigned to represent semantic events (in form of tags) and are integrated into the transcription process as distinct tokens. We demonstrate notable improvements on the SLUE benchmark and yields results that are on par with those for the SLURP dataset. Additionally, we provide a visual analysis of the system{'}s proficiency in accurately pinpointing meaningful tokens over time, illustrating the enhancement in transcription quality through the utilization of supplementary semantic tags.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="karan-etal-2023-1">
<titleInfo>
<title>1-step Speech Understanding and Transcription Using CTC Loss</title>
</titleInfo>
<name type="personal">
<namePart type="given">Singla</namePart>
<namePart type="family">Karan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jalalv</namePart>
<namePart type="family">Shahab</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kim</namePart>
<namePart type="family">Yeon-Jun</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ljolje</namePart>
<namePart type="family">Andrej</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Daniel</namePart>
<namePart type="family">Antonio Moreno</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bangalore</namePart>
<namePart type="family">Srinivas</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stern</namePart>
<namePart type="family">Benjamin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 20th International Conference on Natural Language Processing (ICON)</title>
</titleInfo>
<name type="personal">
<namePart type="given">D</namePart>
<namePart type="given">Pawar</namePart>
<namePart type="family">Jyoti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lalitha</namePart>
<namePart type="given">Devi</namePart>
<namePart type="family">Sobha</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>NLP Association of India (NLPAI)</publisher>
<place>
<placeTerm type="text">Goa University, Goa, India</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Recent studies have made some progress in refining end-to-end (E2E) speech recognition encoders by applying Connectionist Temporal Classification (CTC) loss to enhance named entity recognition within transcriptions. However, these methods have been constrained by their exclusive use of the ASCII character set, allowing only a limited array of semantic labels. We propose 1SPU, a 1-step Speech Processing Unit which can recognize speech events (e.g: speaker change) or an NL event (Intent, Emotion) while also transcribing vocal content. It extends the E2E automatic speech recognition (ASR) system’s vocabulary by adding a set of unused placeholder symbols, conceptually akin to the \textlesspad\textgreater tokens used in sequence modeling. These placeholders are then assigned to represent semantic events (in form of tags) and are integrated into the transcription process as distinct tokens. We demonstrate notable improvements on the SLUE benchmark and yields results that are on par with those for the SLURP dataset. Additionally, we provide a visual analysis of the system’s proficiency in accurately pinpointing meaningful tokens over time, illustrating the enhancement in transcription quality through the utilization of supplementary semantic tags.</abstract>
<identifier type="citekey">karan-etal-2023-1</identifier>
<location>
<url>https://aclanthology.org/2023.icon-1.29</url>
</location>
<part>
<date>2023-12</date>
<extent unit="page">
<start>370</start>
<end>377</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T 1-step Speech Understanding and Transcription Using CTC Loss
%A Karan, Singla
%A Shahab, Jalalv
%A Yeon-Jun, Kim
%A Andrej, Ljolje
%A Antonio Moreno, Daniel
%A Srinivas, Bangalore
%A Benjamin, Stern
%Y Jyoti, D. Pawar
%Y Sobha, Lalitha Devi
%S Proceedings of the 20th International Conference on Natural Language Processing (ICON)
%D 2023
%8 December
%I NLP Association of India (NLPAI)
%C Goa University, Goa, India
%F karan-etal-2023-1
%X Recent studies have made some progress in refining end-to-end (E2E) speech recognition encoders by applying Connectionist Temporal Classification (CTC) loss to enhance named entity recognition within transcriptions. However, these methods have been constrained by their exclusive use of the ASCII character set, allowing only a limited array of semantic labels. We propose 1SPU, a 1-step Speech Processing Unit which can recognize speech events (e.g: speaker change) or an NL event (Intent, Emotion) while also transcribing vocal content. It extends the E2E automatic speech recognition (ASR) system’s vocabulary by adding a set of unused placeholder symbols, conceptually akin to the \textlesspad\textgreater tokens used in sequence modeling. These placeholders are then assigned to represent semantic events (in form of tags) and are integrated into the transcription process as distinct tokens. We demonstrate notable improvements on the SLUE benchmark and yields results that are on par with those for the SLURP dataset. Additionally, we provide a visual analysis of the system’s proficiency in accurately pinpointing meaningful tokens over time, illustrating the enhancement in transcription quality through the utilization of supplementary semantic tags.
%U https://aclanthology.org/2023.icon-1.29
%P 370-377
Markdown (Informal)
[1-step Speech Understanding and Transcription Using CTC Loss](https://aclanthology.org/2023.icon-1.29) (Karan et al., ICON 2023)
ACL
- Singla Karan, Jalalv Shahab, Kim Yeon-Jun, Ljolje Andrej, Daniel Antonio Moreno, Bangalore Srinivas, and Stern Benjamin. 2023. 1-step Speech Understanding and Transcription Using CTC Loss. In Proceedings of the 20th International Conference on Natural Language Processing (ICON), pages 370–377, Goa University, Goa, India. NLP Association of India (NLPAI).