@inproceedings{monteiro-etal-2020-performance,
  author    = {Monteiro, Joao and
               Alam, Md Jahangir and
               Falk, Tiago},
  title     = {On The Performance of Time-Pooling Strategies for End-to-End Spoken Language Identification},
  editor    = {Calzolari, Nicoletta and
               B{\'e}chet, Fr{\'e}d{\'e}ric and
               Blache, Philippe and
               Choukri, Khalid and
               Cieri, Christopher and
               Declerck, Thierry and
               Goggi, Sara and
               Isahara, Hitoshi and
               Maegaard, Bente and
               Mariani, Joseph and
               Mazo, H{\'e}l{\`e}ne and
               Moreno, Asuncion and
               Odijk, Jan and
               Piperidis, Stelios},
  booktitle = {Proceedings of the Twelfth Language Resources and Evaluation Conference},
  publisher = {European Language Resources Association},
  address   = {Marseille, France},
  month     = may,
  year      = {2020},
  pages     = {3566--3572},
  isbn      = {979-10-95546-34-4},
  language  = {English},
  url       = {https://aclanthology.org/2020.lrec-1.438},
  abstract  = {Automatic speech processing applications often have to deal with the problem of aggregating local descriptors (i.e., representations of input speech data corresponding to specific portions across the time dimension) and turning them into a single fixed-dimension representation, known as global descriptor, on top of which downstream classification tasks can be performed. In this paper, we provide an empirical assessment of different time pooling strategies when used with state-of-the-art representation learning models. In particular, insights are provided as to when it is suitable to use simple statistics of local descriptors or when more sophisticated approaches are needed. Here, language identification is used as a case study and a database containing ten oriental languages under varying test conditions (short-duration test recordings, confusing languages, unseen languages) is used. Experiments are performed with classifiers trained on top of global descriptors to provide insights on open-set evaluation performance and show that appropriate selection of such pooling strategies yield embeddings able to outperform well-known benchmark systems as well as previously results based on attention only.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 collection holding the single record with citekey monteiro-etal-2020-performance. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="monteiro-etal-2020-performance">
    <titleInfo>
      <title>On The Performance of Time-Pooling Strategies for End-to-End Spoken Language Identification</title>
    </titleInfo>
    <!-- Authors of the paper -->
    <name type="personal">
      <namePart type="given">Joao</namePart>
      <namePart type="family">Monteiro</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Md</namePart>
      <namePart type="given">Jahangir</namePart>
      <namePart type="family">Alam</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Tiago</namePart>
      <namePart type="family">Falk</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2020-05</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <language>
      <languageTerm type="text">English</languageTerm>
      <languageTerm type="code" authority="iso639-2b">eng</languageTerm>
    </language>
    <!-- Host item: the proceedings volume with its editors, publisher, place, genre and ISBN -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the Twelfth Language Resources and Evaluation Conference</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Nicoletta</namePart>
        <namePart type="family">Calzolari</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Frédéric</namePart>
        <namePart type="family">Béchet</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Philippe</namePart>
        <namePart type="family">Blache</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Khalid</namePart>
        <namePart type="family">Choukri</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Christopher</namePart>
        <namePart type="family">Cieri</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Thierry</namePart>
        <namePart type="family">Declerck</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Sara</namePart>
        <namePart type="family">Goggi</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Hitoshi</namePart>
        <namePart type="family">Isahara</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Bente</namePart>
        <namePart type="family">Maegaard</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Joseph</namePart>
        <namePart type="family">Mariani</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Hélène</namePart>
        <namePart type="family">Mazo</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Asuncion</namePart>
        <namePart type="family">Moreno</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jan</namePart>
        <namePart type="family">Odijk</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Stelios</namePart>
        <namePart type="family">Piperidis</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>European Language Resources Association</publisher>
        <place>
          <placeTerm type="text">Marseille, France</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-10-95546-34-4</identifier>
    </relatedItem>
    <abstract>Automatic speech processing applications often have to deal with the problem of aggregating local descriptors (i.e., representations of input speech data corresponding to specific portions across the time dimension) and turning them into a single fixed-dimension representation, known as global descriptor, on top of which downstream classification tasks can be performed. In this paper, we provide an empirical assessment of different time pooling strategies when used with state-of-the-art representation learning models. In particular, insights are provided as to when it is suitable to use simple statistics of local descriptors or when more sophisticated approaches are needed. Here, language identification is used as a case study and a database containing ten oriental languages under varying test conditions (short-duration test recordings, confusing languages, unseen languages) is used. Experiments are performed with classifiers trained on top of global descriptors to provide insights on open-set evaluation performance and show that appropriate selection of such pooling strategies yield embeddings able to outperform well-known benchmark systems as well as previously results based on attention only.</abstract>
    <identifier type="citekey">monteiro-etal-2020-performance</identifier>
    <location>
      <url>https://aclanthology.org/2020.lrec-1.438</url>
    </location>
    <!-- Position of the paper within the host volume -->
    <part>
      <date>2020-05</date>
      <extent unit="page">
        <start>3566</start>
        <end>3572</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T On The Performance of Time-Pooling Strategies for End-to-End Spoken Language Identification
%A Monteiro, Joao
%A Alam, Md Jahangir
%A Falk, Tiago
%Y Calzolari, Nicoletta
%Y Béchet, Frédéric
%Y Blache, Philippe
%Y Choukri, Khalid
%Y Cieri, Christopher
%Y Declerck, Thierry
%Y Goggi, Sara
%Y Isahara, Hitoshi
%Y Maegaard, Bente
%Y Mariani, Joseph
%Y Mazo, Hélène
%Y Moreno, Asuncion
%Y Odijk, Jan
%Y Piperidis, Stelios
%S Proceedings of the Twelfth Language Resources and Evaluation Conference
%D 2020
%8 May
%I European Language Resources Association
%C Marseille, France
%@ 979-10-95546-34-4
%G English
%F monteiro-etal-2020-performance
%X Automatic speech processing applications often have to deal with the problem of aggregating local descriptors (i.e., representations of input speech data corresponding to specific portions across the time dimension) and turning them into a single fixed-dimension representation, known as global descriptor, on top of which downstream classification tasks can be performed. In this paper, we provide an empirical assessment of different time pooling strategies when used with state-of-the-art representation learning models. In particular, insights are provided as to when it is suitable to use simple statistics of local descriptors or when more sophisticated approaches are needed. Here, language identification is used as a case study and a database containing ten oriental languages under varying test conditions (short-duration test recordings, confusing languages, unseen languages) is used. Experiments are performed with classifiers trained on top of global descriptors to provide insights on open-set evaluation performance and show that appropriate selection of such pooling strategies yield embeddings able to outperform well-known benchmark systems as well as previously results based on attention only.
%U https://aclanthology.org/2020.lrec-1.438
%P 3566-3572
Markdown (Informal)
[On The Performance of Time-Pooling Strategies for End-to-End Spoken Language Identification](https://aclanthology.org/2020.lrec-1.438) (Monteiro et al., LREC 2020)
ACL