@inproceedings{san-etal-2022-automated,
title = "Automated speech tools for helping communities process restricted-access corpora for language revival efforts",
author = "San, Nay and
Bartelds, Martijn and
Ogunremi, Tolulope and
Mount, Alison and
Thompson, Ruben and
Higgins, Michael and
Barker, Roy and
Simpson, Jane and
Jurafsky, Dan",
booktitle = "Proceedings of the Fifth Workshop on the Use of Computational Methods in the Study of Endangered Languages",
month = may,
year = "2022",
address = "Dublin, Ireland",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.computel-1.6",
doi = "10.18653/v1/2022.computel-1.6",
pages = "41--51",
abstract = "Many archival recordings of speech from endangered languages remain unannotated and inaccessible to community members and language learning programs. One bottleneck is the time-intensive nature of annotation. An even narrower bottleneck occurs for recordings with access constraints, such as language that must be vetted or filtered by authorised community members before annotation can begin. We propose a privacy-preserving workflow to widen both bottlenecks for recordings where speech in the endangered language is intermixed with a more widely-used language such as English for meta-linguistic commentary and questions (e.g.What is the word for {`}tree{'}?). We integrate voice activity detection (VAD), spoken language identification (SLI), and automatic speech recognition (ASR) to transcribe the metalinguistic content, which an authorised person can quickly scan to triage recordings that can be annotated by people with lower levels of access. We report work-in-progress processing 136 hours archival audio containing a mix of English and Muruwari. Our collaborative work with the Muruwari custodian of the archival materials show that this workflow reduces metalanguage transcription time by 20{\%} even given only minimal amounts of annotated training data, 10 utterances per language for SLI and for ASR at most 39 minutes, and possibly as little as 39 seconds.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="san-etal-2022-automated">
<titleInfo>
<title>Automated speech tools for helping communities process restricted-access corpora for language revival efforts</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nay</namePart>
<namePart type="family">San</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Martijn</namePart>
<namePart type="family">Bartelds</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tolulope</namePart>
<namePart type="family">Ogunremi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alison</namePart>
<namePart type="family">Mount</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ruben</namePart>
<namePart type="family">Thompson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Michael</namePart>
<namePart type="family">Higgins</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Roy</namePart>
<namePart type="family">Barker</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jane</namePart>
<namePart type="family">Simpson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dan</namePart>
<namePart type="family">Jurafsky</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Fifth Workshop on the Use of Computational Methods in the Study of Endangered Languages</title>
</titleInfo>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Dublin, Ireland</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Many archival recordings of speech from endangered languages remain unannotated and inaccessible to community members and language learning programs. One bottleneck is the time-intensive nature of annotation. An even narrower bottleneck occurs for recordings with access constraints, such as language that must be vetted or filtered by authorised community members before annotation can begin. We propose a privacy-preserving workflow to widen both bottlenecks for recordings where speech in the endangered language is intermixed with a more widely-used language such as English for meta-linguistic commentary and questions (e.g.What is the word for ‘tree’?). We integrate voice activity detection (VAD), spoken language identification (SLI), and automatic speech recognition (ASR) to transcribe the metalinguistic content, which an authorised person can quickly scan to triage recordings that can be annotated by people with lower levels of access. We report work-in-progress processing 136 hours archival audio containing a mix of English and Muruwari. Our collaborative work with the Muruwari custodian of the archival materials show that this workflow reduces metalanguage transcription time by 20% even given only minimal amounts of annotated training data, 10 utterances per language for SLI and for ASR at most 39 minutes, and possibly as little as 39 seconds.</abstract>
<identifier type="citekey">san-etal-2022-automated</identifier>
<identifier type="doi">10.18653/v1/2022.computel-1.6</identifier>
<location>
<url>https://aclanthology.org/2022.computel-1.6</url>
</location>
<part>
<date>2022-05</date>
<extent unit="page">
<start>41</start>
<end>51</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Automated speech tools for helping communities process restricted-access corpora for language revival efforts
%A San, Nay
%A Bartelds, Martijn
%A Ogunremi, Tolulope
%A Mount, Alison
%A Thompson, Ruben
%A Higgins, Michael
%A Barker, Roy
%A Simpson, Jane
%A Jurafsky, Dan
%S Proceedings of the Fifth Workshop on the Use of Computational Methods in the Study of Endangered Languages
%D 2022
%8 May
%I Association for Computational Linguistics
%C Dublin, Ireland
%F san-etal-2022-automated
%X Many archival recordings of speech from endangered languages remain unannotated and inaccessible to community members and language learning programs. One bottleneck is the time-intensive nature of annotation. An even narrower bottleneck occurs for recordings with access constraints, such as language that must be vetted or filtered by authorised community members before annotation can begin. We propose a privacy-preserving workflow to widen both bottlenecks for recordings where speech in the endangered language is intermixed with a more widely-used language such as English for meta-linguistic commentary and questions (e.g.What is the word for ‘tree’?). We integrate voice activity detection (VAD), spoken language identification (SLI), and automatic speech recognition (ASR) to transcribe the metalinguistic content, which an authorised person can quickly scan to triage recordings that can be annotated by people with lower levels of access. We report work-in-progress processing 136 hours archival audio containing a mix of English and Muruwari. Our collaborative work with the Muruwari custodian of the archival materials show that this workflow reduces metalanguage transcription time by 20% even given only minimal amounts of annotated training data, 10 utterances per language for SLI and for ASR at most 39 minutes, and possibly as little as 39 seconds.
%R 10.18653/v1/2022.computel-1.6
%U https://aclanthology.org/2022.computel-1.6
%U https://doi.org/10.18653/v1/2022.computel-1.6
%P 41-51
Markdown (Informal)
[Automated speech tools for helping communities process restricted-access corpora for language revival efforts](https://aclanthology.org/2022.computel-1.6) (San et al., ComputEL 2022)
ACL
- Nay San, Martijn Bartelds, Tolulope Ogunremi, Alison Mount, Ruben Thompson, Michael Higgins, Roy Barker, Jane Simpson, and Dan Jurafsky. 2022. Automated speech tools for helping communities process restricted-access corpora for language revival efforts. In Proceedings of the Fifth Workshop on the Use of Computational Methods in the Study of Endangered Languages, pages 41–51, Dublin, Ireland. Association for Computational Linguistics.