@inproceedings{raval-etal-2020-end,
title = "End-to-End Automatic Speech Recognition for {G}ujarati",
author = "Raval, Deepang and
Pathak, Vyom and
Patel, Muktan and
Bhatt, Brijesh",
editor = "Bhattacharyya, Pushpak and
Sharma, Dipti Misra and
Sangal, Rajeev",
booktitle = "Proceedings of the 17th International Conference on Natural Language Processing (ICON)",
month = dec,
year = "2020",
address = "Indian Institute of Technology Patna, Patna, India",
publisher = "NLP Association of India (NLPAI)",
url = "https://aclanthology.org/2020.icon-main.56",
pages = "409--419",
abstract = "We present a novel approach for improving the performance of an End-to-End speech recognition system for the Gujarati language. We follow a deep learning based approach which includes Convolutional Neural Network (CNN), Bi-directional Long Short Term Memory (BiLSTM) layers, Dense layers, and Connectionist Temporal Classification (CTC) as a loss function. In order to improve the performance of the system with the limited size of the dataset, we present a combined language model (WLM and CLM) based prefix decoding technique and Bidirectional Encoder Representations from Transformers (BERT) based post-processing technique. To gain key insights from our Automatic Speech Recognition (ASR) system, we proposed different analysis methods. These insights help to understand our ASR system based on a particular language (Gujarati) as well as can govern ASR systems{'} to improve the performance for low resource languages. We have trained the model on the Microsoft Speech Corpus, and we observe a 5.11{\%} decrease in Word Error Rate (WER) with respect to base-model WER.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="raval-etal-2020-end">
<titleInfo>
<title>End-to-End Automatic Speech Recognition for Gujarati</title>
</titleInfo>
<name type="personal">
<namePart type="given">Deepang</namePart>
<namePart type="family">Raval</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vyom</namePart>
<namePart type="family">Pathak</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Muktan</namePart>
<namePart type="family">Patel</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Brijesh</namePart>
<namePart type="family">Bhatt</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 17th International Conference on Natural Language Processing (ICON)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Pushpak</namePart>
<namePart type="family">Bhattacharyya</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dipti</namePart>
<namePart type="given">Misra</namePart>
<namePart type="family">Sharma</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rajeev</namePart>
<namePart type="family">Sangal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>NLP Association of India (NLPAI)</publisher>
<place>
<placeTerm type="text">Indian Institute of Technology Patna, Patna, India</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We present a novel approach for improving the performance of an End-to-End speech recognition system for the Gujarati language. We follow a deep learning based approach which includes Convolutional Neural Network (CNN), Bi-directional Long Short Term Memory (BiLSTM) layers, Dense layers, and Connectionist Temporal Classification (CTC) as a loss function. In order to improve the performance of the system with the limited size of the dataset, we present a combined language model (WLM and CLM) based prefix decoding technique and Bidirectional Encoder Representations from Transformers (BERT) based post-processing technique. To gain key insights from our Automatic Speech Recognition (ASR) system, we proposed different analysis methods. These insights help to understand our ASR system based on a particular language (Gujarati) as well as can govern ASR systems’ to improve the performance for low resource languages. We have trained the model on the Microsoft Speech Corpus, and we observe a 5.11% decrease in Word Error Rate (WER) with respect to base-model WER.</abstract>
<identifier type="citekey">raval-etal-2020-end</identifier>
<location>
<url>https://aclanthology.org/2020.icon-main.56</url>
</location>
<part>
<date>2020-12</date>
<extent unit="page">
<start>409</start>
<end>419</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T End-to-End Automatic Speech Recognition for Gujarati
%A Raval, Deepang
%A Pathak, Vyom
%A Patel, Muktan
%A Bhatt, Brijesh
%Y Bhattacharyya, Pushpak
%Y Sharma, Dipti Misra
%Y Sangal, Rajeev
%S Proceedings of the 17th International Conference on Natural Language Processing (ICON)
%D 2020
%8 December
%I NLP Association of India (NLPAI)
%C Indian Institute of Technology Patna, Patna, India
%F raval-etal-2020-end
%X We present a novel approach for improving the performance of an End-to-End speech recognition system for the Gujarati language. We follow a deep learning based approach which includes Convolutional Neural Network (CNN), Bi-directional Long Short Term Memory (BiLSTM) layers, Dense layers, and Connectionist Temporal Classification (CTC) as a loss function. In order to improve the performance of the system with the limited size of the dataset, we present a combined language model (WLM and CLM) based prefix decoding technique and Bidirectional Encoder Representations from Transformers (BERT) based post-processing technique. To gain key insights from our Automatic Speech Recognition (ASR) system, we proposed different analysis methods. These insights help to understand our ASR system based on a particular language (Gujarati) as well as can govern ASR systems’ to improve the performance for low resource languages. We have trained the model on the Microsoft Speech Corpus, and we observe a 5.11% decrease in Word Error Rate (WER) with respect to base-model WER.
%U https://aclanthology.org/2020.icon-main.56
%P 409-419
Markdown (Informal)
[End-to-End Automatic Speech Recognition for Gujarati](https://aclanthology.org/2020.icon-main.56) (Raval et al., ICON 2020)
ACL
- Deepang Raval, Vyom Pathak, Muktan Patel, and Brijesh Bhatt. 2020. End-to-End Automatic Speech Recognition for Gujarati. In Proceedings of the 17th International Conference on Natural Language Processing (ICON), pages 409–419, Indian Institute of Technology Patna, Patna, India. NLP Association of India (NLPAI).