@inproceedings{maxwelll-smith-foley-2023-automated,
title = "Automated speech recognition of {I}ndonesian-{E}nglish language lessons on {Y}ou{T}ube using transfer learning",
author = "Maxwell-Smith, Zara and
Foley, Ben",
editor = "Serikov, Oleg and
Voloshina, Ekaterina and
Postnikova, Anna and
Klyachko, Elena and
Vylomova, Ekaterina and
Shavrina, Tatiana and
Le Ferrand, Eric and
Malykh, Valentin and
Tyers, Francis and
Arkhangelskiy, Timofey and
Mikhailov, Vladislav",
booktitle = "Proceedings of the Second Workshop on NLP Applications to Field Linguistics",
month = may,
year = "2023",
address = "Dubrovnik, Croatia",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.fieldmatters-1.1",
doi = "10.18653/v1/2023.fieldmatters-1.1",
pages = "1--16",
abstract = "Experiments to fine-tune large multilingual models with limited data from a specific domain or setting have the potential to improve automatic speech recognition (ASR) outcomes. This paper reports on the use of the Elpis ASR pipeline to fine-tune two pre-trained base models, Wav2Vec2-XLSR-53 and Wav2Vec2-Large-XLSR-Indonesian, with various mixes of data from 3 YouTube channels teaching Indonesian with English as the language of instruction. We discuss our results inferring new lesson audio (22-46{\%} word error rate) in the context of speeding data collection in diverse and specialised settings. This study is an example of how ASR can be used to accelerate natural language research, expanding ethically sourced data in low-resource settings.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="maxwelll-smith-foley-2023-automated">
<titleInfo>
<title>Automated speech recognition of Indonesian-English language lessons on YouTube using transfer learning</title>
</titleInfo>
<name type="personal">
<namePart type="given">Zara</namePart>
<namePart type="family">Maxwell-Smith</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ben</namePart>
<namePart type="family">Foley</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Second Workshop on NLP Applications to Field Linguistics</title>
</titleInfo>
<name type="personal">
<namePart type="given">Oleg</namePart>
<namePart type="family">Serikov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Voloshina</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anna</namePart>
<namePart type="family">Postnikova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Elena</namePart>
<namePart type="family">Klyachko</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Vylomova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tatiana</namePart>
<namePart type="family">Shavrina</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Eric</namePart>
<namePart type="family">Le Ferrand</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Valentin</namePart>
<namePart type="family">Malykh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Francis</namePart>
<namePart type="family">Tyers</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Timofey</namePart>
<namePart type="family">Arkhangelskiy</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vladislav</namePart>
<namePart type="family">Mikhailov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Dubrovnik, Croatia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Experiments to fine-tune large multilingual models with limited data from a specific domain or setting have the potential to improve automatic speech recognition (ASR) outcomes. This paper reports on the use of the Elpis ASR pipeline to fine-tune two pre-trained base models, Wav2Vec2-XLSR-53 and Wav2Vec2-Large-XLSR-Indonesian, with various mixes of data from 3 YouTube channels teaching Indonesian with English as the language of instruction. We discuss our results inferring new lesson audio (22-46% word error rate) in the context of speeding data collection in diverse and specialised settings. This study is an example of how ASR can be used to accelerate natural language research, expanding ethically sourced data in low-resource settings.</abstract>
<identifier type="citekey">maxwelll-smith-foley-2023-automated</identifier>
<identifier type="doi">10.18653/v1/2023.fieldmatters-1.1</identifier>
<location>
<url>https://aclanthology.org/2023.fieldmatters-1.1</url>
</location>
<part>
<date>2023-05</date>
<extent unit="page">
<start>1</start>
<end>16</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Automated speech recognition of Indonesian-English language lessons on YouTube using transfer learning
%A Maxwell-Smith, Zara
%A Foley, Ben
%Y Serikov, Oleg
%Y Voloshina, Ekaterina
%Y Postnikova, Anna
%Y Klyachko, Elena
%Y Vylomova, Ekaterina
%Y Shavrina, Tatiana
%Y Le Ferrand, Eric
%Y Malykh, Valentin
%Y Tyers, Francis
%Y Arkhangelskiy, Timofey
%Y Mikhailov, Vladislav
%S Proceedings of the Second Workshop on NLP Applications to Field Linguistics
%D 2023
%8 May
%I Association for Computational Linguistics
%C Dubrovnik, Croatia
%F maxwelll-smith-foley-2023-automated
%X Experiments to fine-tune large multilingual models with limited data from a specific domain or setting have the potential to improve automatic speech recognition (ASR) outcomes. This paper reports on the use of the Elpis ASR pipeline to fine-tune two pre-trained base models, Wav2Vec2-XLSR-53 and Wav2Vec2-Large-XLSR-Indonesian, with various mixes of data from 3 YouTube channels teaching Indonesian with English as the language of instruction. We discuss our results inferring new lesson audio (22-46% word error rate) in the context of speeding data collection in diverse and specialised settings. This study is an example of how ASR can be used to accelerate natural language research, expanding ethically sourced data in low-resource settings.
%R 10.18653/v1/2023.fieldmatters-1.1
%U https://aclanthology.org/2023.fieldmatters-1.1
%U https://doi.org/10.18653/v1/2023.fieldmatters-1.1
%P 1-16
Markdown (Informal)
[Automated speech recognition of Indonesian-English language lessons on YouTube using transfer learning](https://aclanthology.org/2023.fieldmatters-1.1) (Maxwell-Smith & Foley, FieldMatters 2023)
ACL