@inproceedings{haswani-mohankumar-2022-methods,
title = "Methods to Optimize {W}av2{V}ec with Language Model for Automatic Speech Recognition in Resource Constrained Environment",
author = "Haswani, Vaibhav and
Mohankumar, Padmapriya",
editor = "Akhtar, Md. Shad and
Chakraborty, Tanmoy",
booktitle = "Proceedings of the 19th International Conference on Natural Language Processing (ICON)",
month = dec,
year = "2022",
address = "New Delhi, India",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.icon-main.20",
pages = "149--153",
abstract = "Automatic Speech Recognition (ASR) on resource constrained environment is a complex task since most of the State-Of-The-Art models are combination of multilayered convolutional neural network (CNN) and Transformer models which itself requires huge resources such as GPU or TPU for training as well as inference. The accuracy as a performance metric of an ASR system depends upon the efficiency of phonemes to word translation of an Acoustic Model and context correction of the Language model. However, inference as a performance metric is also an important aspect, which mostly depends upon the resources. Also, most of the ASR models uses transformer models at its core and one caveat of transformers is that it usually has a finite amount of sequence length it can handle. Either because it uses position encodings or simply because the cost of attention in transformers is actually O(n{\mbox{$^2$}}) in sequence length, meaning that using very large sequence length explodes in complexity/memory. So you cannot run the system with finite hardware even a very high-end GPU, because if we inference even a one hour long audio with Wav2Vec the system will crash. In this paper, we used some state-of-the-art methods to optimize the Wav2Vec model for better accuracy of predictions in resource constrained systems. In addition, we have performed tests with other SOTA models such as Citrinet and Quartznet for the comparative analysis.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="haswani-mohankumar-2022-methods">
<titleInfo>
<title>Methods to Optimize Wav2Vec with Language Model for Automatic Speech Recognition in Resource Constrained Environment</title>
</titleInfo>
<name type="personal">
<namePart type="given">Vaibhav</namePart>
<namePart type="family">Haswani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Padmapriya</namePart>
<namePart type="family">Mohankumar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 19th International Conference on Natural Language Processing (ICON)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Md.</namePart>
<namePart type="given">Shad</namePart>
<namePart type="family">Akhtar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tanmoy</namePart>
<namePart type="family">Chakraborty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">New Delhi, India</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Automatic Speech Recognition (ASR) on resource constrained environment is a complex task since most of the State-Of-The-Art models are combination of multilayered convolutional neural network (CNN) and Transformer models which itself requires huge resources such as GPU or TPU for training as well as inference. The accuracy as a performance metric of an ASR system depends upon the efficiency of phonemes to word translation of an Acoustic Model and context correction of the Language model. However, inference as a performance metric is also an important aspect, which mostly depends upon the resources. Also, most of the ASR models uses transformer models at its core and one caveat of transformers is that it usually has a finite amount of sequence length it can handle. Either because it uses position encodings or simply because the cost of attention in transformers is actually O(n²) in sequence length, meaning that using very large sequence length explodes in complexity/memory. So you cannot run the system with finite hardware even a very high-end GPU, because if we inference even a one hour long audio with Wav2Vec the system will crash. In this paper, we used some state-of-the-art methods to optimize the Wav2Vec model for better accuracy of predictions in resource constrained systems. In addition, we have performed tests with other SOTA models such as Citrinet and Quartznet for the comparative analysis.</abstract>
<identifier type="citekey">haswani-mohankumar-2022-methods</identifier>
<location>
<url>https://aclanthology.org/2022.icon-main.20</url>
</location>
<part>
<date>2022-12</date>
<extent unit="page">
<start>149</start>
<end>153</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Methods to Optimize Wav2Vec with Language Model for Automatic Speech Recognition in Resource Constrained Environment
%A Haswani, Vaibhav
%A Mohankumar, Padmapriya
%Y Akhtar, Md. Shad
%Y Chakraborty, Tanmoy
%S Proceedings of the 19th International Conference on Natural Language Processing (ICON)
%D 2022
%8 December
%I Association for Computational Linguistics
%C New Delhi, India
%F haswani-mohankumar-2022-methods
%X Automatic Speech Recognition (ASR) on resource constrained environment is a complex task since most of the State-Of-The-Art models are combination of multilayered convolutional neural network (CNN) and Transformer models which itself requires huge resources such as GPU or TPU for training as well as inference. The accuracy as a performance metric of an ASR system depends upon the efficiency of phonemes to word translation of an Acoustic Model and context correction of the Language model. However, inference as a performance metric is also an important aspect, which mostly depends upon the resources. Also, most of the ASR models uses transformer models at its core and one caveat of transformers is that it usually has a finite amount of sequence length it can handle. Either because it uses position encodings or simply because the cost of attention in transformers is actually O(n²) in sequence length, meaning that using very large sequence length explodes in complexity/memory. So you cannot run the system with finite hardware even a very high-end GPU, because if we inference even a one hour long audio with Wav2Vec the system will crash. In this paper, we used some state-of-the-art methods to optimize the Wav2Vec model for better accuracy of predictions in resource constrained systems. In addition, we have performed tests with other SOTA models such as Citrinet and Quartznet for the comparative analysis.
%U https://aclanthology.org/2022.icon-main.20
%P 149-153
Markdown (Informal)
[Methods to Optimize Wav2Vec with Language Model for Automatic Speech Recognition in Resource Constrained Environment](https://aclanthology.org/2022.icon-main.20) (Haswani & Mohankumar, ICON 2022)
ACL