@inproceedings{p-mamidi-2024-towards,
title = "Towards Efficient Audio-Text Keyword Spotting: Quantization and Multi-Scale Linear Attention with Foundation Models",
author = "P, Rahothvarman and
Mamidi, Radhika",
editor = "Lalitha Devi, Sobha and
Arora, Karunesh",
booktitle = "Proceedings of the 21st International Conference on Natural Language Processing (ICON)",
month = dec,
year = "2024",
address = "AU-KBC Research Centre, Chennai, India",
publisher = "NLP Association of India (NLPAI)",
url = "https://aclanthology.org/2024.icon-1.31/",
pages = "264--268",
abstract = "Open Vocabulary Keyword Spotting is essential in numerous applications, from virtual assistants to security systems, as it allows systems to identify specific words or phrases in continuous speech. In this paper, we propose a novel end-to-end method for detecting user-defined open vocabulary keywords by leveraging linguistic patterns for the correlation between audio and text modalities. Our approach utilizes quantized pre-trained foundation models for robust audio embeddings and a unique lightweight Multi-Scale Linear Attention (MSLA) network that aligns speech and text representations for effective cross-modal agreement. We evaluate our method on two distinct datasets, comparing its performance against other baselines. The results highlight the effectiveness of our approach, achieving significant improvements over the Cross-Modality Correspondence Detector (CMCD) method, with a 16.08{\%} increase in AUC and a 17.2{\%} reduction in EER metrics on the Google Speech Commands dataset. These findings demonstrate the potential of our method to advance keyword spotting across various real-world applications."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="p-mamidi-2024-towards">
    <titleInfo>
      <title>Towards Efficient Audio-Text Keyword Spotting: Quantization and Multi-Scale Linear Attention with Foundation Models</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Rahothvarman</namePart>
      <namePart type="family">P</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Radhika</namePart>
      <namePart type="family">Mamidi</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2024-12</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 21st International Conference on Natural Language Processing (ICON)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Sobha</namePart>
        <namePart type="family">Lalitha Devi</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Karunesh</namePart>
        <namePart type="family">Arora</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>NLP Association of India (NLPAI)</publisher>
        <place>
          <placeTerm type="text">AU-KBC Research Centre, Chennai, India</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Open Vocabulary Keyword Spotting is essential in numerous applications, from virtual assistants to security systems, as it allows systems to identify specific words or phrases in continuous speech. In this paper, we propose a novel end-to-end method for detecting user-defined open vocabulary keywords by leveraging linguistic patterns for the correlation between audio and text modalities. Our approach utilizes quantized pre-trained foundation models for robust audio embeddings and a unique lightweight Multi-Scale Linear Attention (MSLA) network that aligns speech and text representations for effective cross-modal agreement. We evaluate our method on two distinct datasets, comparing its performance against other baselines. The results highlight the effectiveness of our approach, achieving significant improvements over the Cross-Modality Correspondence Detector (CMCD) method, with a 16.08% increase in AUC and a 17.2% reduction in EER metrics on the Google Speech Commands dataset. These findings demonstrate the potential of our method to advance keyword spotting across various real-world applications.</abstract>
    <identifier type="citekey">p-mamidi-2024-towards</identifier>
    <location>
      <url>https://aclanthology.org/2024.icon-1.31/</url>
    </location>
    <part>
      <date>2024-12</date>
      <extent unit="page">
        <start>264</start>
        <end>268</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Towards Efficient Audio-Text Keyword Spotting: Quantization and Multi-Scale Linear Attention with Foundation Models
%A P, Rahothvarman
%A Mamidi, Radhika
%Y Lalitha Devi, Sobha
%Y Arora, Karunesh
%S Proceedings of the 21st International Conference on Natural Language Processing (ICON)
%D 2024
%8 December
%I NLP Association of India (NLPAI)
%C AU-KBC Research Centre, Chennai, India
%F p-mamidi-2024-towards
%X Open Vocabulary Keyword Spotting is essential in numerous applications, from virtual assistants to security systems, as it allows systems to identify specific words or phrases in continuous speech. In this paper, we propose a novel end-to-end method for detecting user-defined open vocabulary keywords by leveraging linguistic patterns for the correlation between audio and text modalities. Our approach utilizes quantized pre-trained foundation models for robust audio embeddings and a unique lightweight Multi-Scale Linear Attention (MSLA) network that aligns speech and text representations for effective cross-modal agreement. We evaluate our method on two distinct datasets, comparing its performance against other baselines. The results highlight the effectiveness of our approach, achieving significant improvements over the Cross-Modality Correspondence Detector (CMCD) method, with a 16.08% increase in AUC and a 17.2% reduction in EER metrics on the Google Speech Commands dataset. These findings demonstrate the potential of our method to advance keyword spotting across various real-world applications.
%U https://aclanthology.org/2024.icon-1.31/
%P 264-268
Markdown (Informal)
[Towards Efficient Audio-Text Keyword Spotting: Quantization and Multi-Scale Linear Attention with Foundation Models](https://aclanthology.org/2024.icon-1.31/) (P & Mamidi, ICON 2024)
ACL
Rahothvarman P and Radhika Mamidi. 2024. Towards Efficient Audio-Text Keyword Spotting: Quantization and Multi-Scale Linear Attention with Foundation Models. In Proceedings of the 21st International Conference on Natural Language Processing (ICON), pages 264–268, AU-KBC Research Centre, Chennai, India. NLP Association of India (NLPAI).