@inproceedings{kumar-etal-2024-tokenverse,
title = "{T}oken{V}erse: Towards Unifying Speech and {NLP} Tasks via Transducer-based {ASR}",
author = "Kumar, Shashi and
Madikeri, Srikanth and
Zuluaga Gomez, Juan Pablo and
Thorbecke, Iuliia and
Villatoro-tello, Esa{\'u} and
Burdisso, Sergio and
Motlicek, Petr and
S, Karthik and
Ganapathiraju, Aravind",
editor = "Al-Onaizan, Yaser and
Bansal, Mohit and
Chen, Yun-Nung",
booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.emnlp-main.1167",
pages = "20988--20995",
abstract = "In traditional conversational intelligence from speech, a cascaded pipeline is used, involving tasks such as voice activity detection, diarization, transcription, and subsequent processing with different NLP models for tasks like semantic endpointing and named entity recognition (NER). Our paper introduces TokenVerse, a single Transducer-based model designed to handle multiple tasks. This is achieved by integrating task-specific tokens into the reference text during ASR model training, streamlining the inference and eliminating the need for separate NLP models. In addition to ASR, we conduct experiments on 3 different tasks: speaker change detection, endpointing, and NER. Our experiments on a public and a private dataset show that the proposed method improves ASR by up to 7.7{\%} in relative WER while outperforming the cascaded pipeline approach in individual task performance. Our code is publicly available: https://github.com/idiap/tokenverse-unifying-speech-nlp",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="kumar-etal-2024-tokenverse">
<titleInfo>
<title>TokenVerse: Towards Unifying Speech and NLP Tasks via Transducer-based ASR</title>
</titleInfo>
<name type="personal">
<namePart type="given">Shashi</namePart>
<namePart type="family">Kumar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Srikanth</namePart>
<namePart type="family">Madikeri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Juan</namePart>
<namePart type="given">Pablo</namePart>
<namePart type="family">Zuluaga Gomez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Iuliia</namePart>
<namePart type="family">Thorbecke</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Esaú</namePart>
<namePart type="family">Villatoro-tello</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sergio</namePart>
<namePart type="family">Burdisso</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Petr</namePart>
<namePart type="family">Motlicek</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Karthik</namePart>
<namePart type="family">S</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aravind</namePart>
<namePart type="family">Ganapathiraju</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yaser</namePart>
<namePart type="family">Al-Onaizan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohit</namePart>
<namePart type="family">Bansal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yun-Nung</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, Florida, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>In traditional conversational intelligence from speech, a cascaded pipeline is used, involving tasks such as voice activity detection, diarization, transcription, and subsequent processing with different NLP models for tasks like semantic endpointing and named entity recognition (NER). Our paper introduces TokenVerse, a single Transducer-based model designed to handle multiple tasks. This is achieved by integrating task-specific tokens into the reference text during ASR model training, streamlining the inference and eliminating the need for separate NLP models. In addition to ASR, we conduct experiments on 3 different tasks: speaker change detection, endpointing, and NER. Our experiments on a public and a private dataset show that the proposed method improves ASR by up to 7.7% in relative WER while outperforming the cascaded pipeline approach in individual task performance. Our code is publicly available: https://github.com/idiap/tokenverse-unifying-speech-nlp</abstract>
<identifier type="citekey">kumar-etal-2024-tokenverse</identifier>
<location>
<url>https://aclanthology.org/2024.emnlp-main.1167</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>20988</start>
<end>20995</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T TokenVerse: Towards Unifying Speech and NLP Tasks via Transducer-based ASR
%A Kumar, Shashi
%A Madikeri, Srikanth
%A Zuluaga Gomez, Juan Pablo
%A Thorbecke, Iuliia
%A Villatoro-tello, Esaú
%A Burdisso, Sergio
%A Motlicek, Petr
%A S, Karthik
%A Ganapathiraju, Aravind
%Y Al-Onaizan, Yaser
%Y Bansal, Mohit
%Y Chen, Yun-Nung
%S Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F kumar-etal-2024-tokenverse
%X In traditional conversational intelligence from speech, a cascaded pipeline is used, involving tasks such as voice activity detection, diarization, transcription, and subsequent processing with different NLP models for tasks like semantic endpointing and named entity recognition (NER). Our paper introduces TokenVerse, a single Transducer-based model designed to handle multiple tasks. This is achieved by integrating task-specific tokens into the reference text during ASR model training, streamlining the inference and eliminating the need for separate NLP models. In addition to ASR, we conduct experiments on 3 different tasks: speaker change detection, endpointing, and NER. Our experiments on a public and a private dataset show that the proposed method improves ASR by up to 7.7% in relative WER while outperforming the cascaded pipeline approach in individual task performance. Our code is publicly available: https://github.com/idiap/tokenverse-unifying-speech-nlp
%U https://aclanthology.org/2024.emnlp-main.1167
%P 20988-20995
Markdown (Informal)
[TokenVerse: Towards Unifying Speech and NLP Tasks via Transducer-based ASR](https://aclanthology.org/2024.emnlp-main.1167) (Kumar et al., EMNLP 2024)
ACL
- Shashi Kumar, Srikanth Madikeri, Juan Pablo Zuluaga Gomez, Iuliia Thorbecke, Esaú Villatoro-tello, Sergio Burdisso, Petr Motlicek, Karthik S, and Aravind Ganapathiraju. 2024. TokenVerse: Towards Unifying Speech and NLP Tasks via Transducer-based ASR. In Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing, pages 20988–20995, Miami, Florida, USA. Association for Computational Linguistics.