@inproceedings{cavar-etal-2016-endangered,
title = "Endangered Language Documentation: Bootstrapping a Chatino Speech Corpus, Forced Aligner, {ASR}",
author = "{\'C}avar, Malgorzata and
{\'C}avar, Damir and
Cruz, Hilaria",
editor = "Calzolari, Nicoletta and
Choukri, Khalid and
Declerck, Thierry and
Goggi, Sara and
Grobelnik, Marko and
Maegaard, Bente and
Mariani, Joseph and
Mazo, Helene and
Moreno, Asuncion and
Odijk, Jan and
Piperidis, Stelios",
booktitle = "Proceedings of the Tenth International Conference on Language Resources and Evaluation ({LREC}'16)",
month = may,
year = "2016",
address = "Portoro{\v{z}}, Slovenia",
publisher = "European Language Resources Association (ELRA)",
url = "https://aclanthology.org/L16-1632",
pages = "4004--4011",
abstract = "This project approaches the problem of language documentation and revitalization from a rather untraditional angle. To improve and facilitate language documentation of endangered languages, we attempt to use corpus linguistic methods and speech and language technologies to reduce the time needed for transcription and annotation of audio and video language recordings. The paper demonstrates this approach on the example of the endangered and seriously under-resourced variety of Eastern Chatino (CTP). We show how initial speech corpora can be created that can facilitate the development of speech and language technologies for under-resourced languages by utilizing Forced Alignment tools to time align transcriptions. Time-aligned transcriptions can be used to train speech corpora and utilize automatic speech recognition tools for the transcription and annotation of untranscribed data. Speech technologies can be used to reduce the time and effort necessary for transcription and annotation of large collections of audio and video recordings in digital language archives, addressing the transcription bottleneck problem that most language archives and many under-documented languages are confronted with. This approach can increase the availability of language resources from low-resourced and endangered languages to speech and language technology research and development.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="cavar-etal-2016-endangered">
<titleInfo>
<title>Endangered Language Documentation: Bootstrapping a Chatino Speech Corpus, Forced Aligner, ASR</title>
</titleInfo>
<name type="personal">
<namePart type="given">Malgorzata</namePart>
<namePart type="family">Ćavar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Damir</namePart>
<namePart type="family">Ćavar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hilaria</namePart>
<namePart type="family">Cruz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2016-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC’16)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Khalid</namePart>
<namePart type="family">Choukri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Thierry</namePart>
<namePart type="family">Declerck</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sara</namePart>
<namePart type="family">Goggi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marko</namePart>
<namePart type="family">Grobelnik</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bente</namePart>
<namePart type="family">Maegaard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joseph</namePart>
<namePart type="family">Mariani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Helene</namePart>
<namePart type="family">Mazo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Asuncion</namePart>
<namePart type="family">Moreno</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jan</namePart>
<namePart type="family">Odijk</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stelios</namePart>
<namePart type="family">Piperidis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>European Language Resources Association (ELRA)</publisher>
<place>
<placeTerm type="text">Portorož, Slovenia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>This project approaches the problem of language documentation and revitalization from a rather untraditional angle. To improve and facilitate language documentation of endangered languages, we attempt to use corpus linguistic methods and speech and language technologies to reduce the time needed for transcription and annotation of audio and video language recordings. The paper demonstrates this approach on the example of the endangered and seriously under-resourced variety of Eastern Chatino (CTP). We show how initial speech corpora can be created that can facilitate the development of speech and language technologies for under-resourced languages by utilizing Forced Alignment tools to time align transcriptions. Time-aligned transcriptions can be used to train speech corpora and utilize automatic speech recognition tools for the transcription and annotation of untranscribed data. Speech technologies can be used to reduce the time and effort necessary for transcription and annotation of large collections of audio and video recordings in digital language archives, addressing the transcription bottleneck problem that most language archives and many under-documented languages are confronted with. This approach can increase the availability of language resources from low-resourced and endangered languages to speech and language technology research and development.</abstract>
<identifier type="citekey">cavar-etal-2016-endangered</identifier>
<location>
<url>https://aclanthology.org/L16-1632</url>
</location>
<part>
<date>2016-05</date>
<extent unit="page">
<start>4004</start>
<end>4011</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Endangered Language Documentation: Bootstrapping a Chatino Speech Corpus, Forced Aligner, ASR
%A Ćavar, Malgorzata
%A Ćavar, Damir
%A Cruz, Hilaria
%Y Calzolari, Nicoletta
%Y Choukri, Khalid
%Y Declerck, Thierry
%Y Goggi, Sara
%Y Grobelnik, Marko
%Y Maegaard, Bente
%Y Mariani, Joseph
%Y Mazo, Helene
%Y Moreno, Asuncion
%Y Odijk, Jan
%Y Piperidis, Stelios
%S Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC’16)
%D 2016
%8 May
%I European Language Resources Association (ELRA)
%C Portorož, Slovenia
%F cavar-etal-2016-endangered
%X This project approaches the problem of language documentation and revitalization from a rather untraditional angle. To improve and facilitate language documentation of endangered languages, we attempt to use corpus linguistic methods and speech and language technologies to reduce the time needed for transcription and annotation of audio and video language recordings. The paper demonstrates this approach on the example of the endangered and seriously under-resourced variety of Eastern Chatino (CTP). We show how initial speech corpora can be created that can facilitate the development of speech and language technologies for under-resourced languages by utilizing Forced Alignment tools to time align transcriptions. Time-aligned transcriptions can be used to train speech corpora and utilize automatic speech recognition tools for the transcription and annotation of untranscribed data. Speech technologies can be used to reduce the time and effort necessary for transcription and annotation of large collections of audio and video recordings in digital language archives, addressing the transcription bottleneck problem that most language archives and many under-documented languages are confronted with. This approach can increase the availability of language resources from low-resourced and endangered languages to speech and language technology research and development.
%U https://aclanthology.org/L16-1632
%P 4004-4011
Markdown (Informal)
[Endangered Language Documentation: Bootstrapping a Chatino Speech Corpus, Forced Aligner, ASR](https://aclanthology.org/L16-1632) (Ćavar et al., LREC 2016)
ACL