% Workshop paper from the SLTU-CCURL 2020 proceedings (ACL Anthology ID 2020.sltu-1.43).
@inproceedings{wisniewski-etal-2020-phonemic,
    title = "Phonemic Transcription of Low-Resource Languages: To What Extent can Preprocessing be Automated?",
    author = "Wisniewski, Guillaume and
      Guillaume, S{\'e}verine and
      Michaud, Alexis",
    editor = "Beermann, Dorothee and
      Besacier, Laurent and
      Sakti, Sakriani and
      Soria, Claudia",
    booktitle = "Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages ({SLTU}) and Collaboration and Computing for Under-Resourced Languages ({CCURL})",
    month = may,
    year = "2020",
    address = "Marseille, France",
    publisher = "European Language Resources Association",
    url = "https://aclanthology.org/2020.sltu-1.43",
    pages = "306--315",
    abstract = "Automatic Speech Recognition for low-resource languages has been an active field of research for more than a decade. It holds promise for facilitating the urgent task of documenting the world{'}s dwindling linguistic diversity. Various methodological hurdles are encountered in the course of this exciting development, however. A well-identified difficulty is that data preprocessing is not at all trivial: data collected in classical fieldwork are usually tailored to the needs of the linguist who collects them, and there is baffling diversity in formats and annotation schema, even among fieldworkers who use the same software package (such as ELAN). The tests reported here (on Yongning Na and other languages from the Pangloss Collection, an open archive of endangered languages) explore some possibilities for automating the process of data preprocessing: assessing to what extent it is possible to bypass the involvement of language experts for menial tasks of data preparation for Natural Language Processing (NLP) purposes. What is at stake is the accessibility of language archive data for a range of NLP tasks and beyond.",
    language = "English",
    isbn = "979-10-95546-35-1",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="wisniewski-etal-2020-phonemic">
<titleInfo>
<title>Phonemic Transcription of Low-Resource Languages: To What Extent can Preprocessing be Automated?</title>
</titleInfo>
<name type="personal">
<namePart type="given">Guillaume</namePart>
<namePart type="family">Wisniewski</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Séverine</namePart>
<namePart type="family">Guillaume</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alexis</namePart>
<namePart type="family">Michaud</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<language>
<languageTerm type="text">English</languageTerm>
<languageTerm type="code" authority="iso639-2b">eng</languageTerm>
</language>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Dorothee</namePart>
<namePart type="family">Beermann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Laurent</namePart>
<namePart type="family">Besacier</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sakriani</namePart>
<namePart type="family">Sakti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Claudia</namePart>
<namePart type="family">Soria</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>European Language Resources Association</publisher>
<place>
<placeTerm type="text">Marseille, France</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-10-95546-35-1</identifier>
</relatedItem>
<abstract>Automatic Speech Recognition for low-resource languages has been an active field of research for more than a decade. It holds promise for facilitating the urgent task of documenting the world’s dwindling linguistic diversity. Various methodological hurdles are encountered in the course of this exciting development, however. A well-identified difficulty is that data preprocessing is not at all trivial: data collected in classical fieldwork are usually tailored to the needs of the linguist who collects them, and there is baffling diversity in formats and annotation schema, even among fieldworkers who use the same software package (such as ELAN). The tests reported here (on Yongning Na and other languages from the Pangloss Collection, an open archive of endangered languages) explore some possibilities for automating the process of data preprocessing: assessing to what extent it is possible to bypass the involvement of language experts for menial tasks of data preparation for Natural Language Processing (NLP) purposes. What is at stake is the accessibility of language archive data for a range of NLP tasks and beyond.</abstract>
<identifier type="citekey">wisniewski-etal-2020-phonemic</identifier>
<location>
<url>https://aclanthology.org/2020.sltu-1.43</url>
</location>
<part>
<date>2020-05</date>
<extent unit="page">
<start>306</start>
<end>315</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Phonemic Transcription of Low-Resource Languages: To What Extent can Preprocessing be Automated?
%A Wisniewski, Guillaume
%A Guillaume, Séverine
%A Michaud, Alexis
%Y Beermann, Dorothee
%Y Besacier, Laurent
%Y Sakti, Sakriani
%Y Soria, Claudia
%S Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)
%D 2020
%8 May
%I European Language Resources Association
%C Marseille, France
%@ 979-10-95546-35-1
%G English
%F wisniewski-etal-2020-phonemic
%X Automatic Speech Recognition for low-resource languages has been an active field of research for more than a decade. It holds promise for facilitating the urgent task of documenting the world’s dwindling linguistic diversity. Various methodological hurdles are encountered in the course of this exciting development, however. A well-identified difficulty is that data preprocessing is not at all trivial: data collected in classical fieldwork are usually tailored to the needs of the linguist who collects them, and there is baffling diversity in formats and annotation schema, even among fieldworkers who use the same software package (such as ELAN). The tests reported here (on Yongning Na and other languages from the Pangloss Collection, an open archive of endangered languages) explore some possibilities for automating the process of data preprocessing: assessing to what extent it is possible to bypass the involvement of language experts for menial tasks of data preparation for Natural Language Processing (NLP) purposes. What is at stake is the accessibility of language archive data for a range of NLP tasks and beyond.
%U https://aclanthology.org/2020.sltu-1.43
%P 306-315
Markdown (Informal)
[Phonemic Transcription of Low-Resource Languages: To What Extent can Preprocessing be Automated?](https://aclanthology.org/2020.sltu-1.43) (Wisniewski et al., SLTU 2020)
ACL