@inproceedings{obrien-etal-2023-gamli,
title = "Gamli - {I}celandic Oral History Corpus: Design, Collection and Evaluation",
author = "O{'}Brien, Luke and
Ingimundarson, Finnur and
Gu{\dh}nasson, J{\'o}n and
Steingr{\'\i}msson, Stein{\th}{\'o}r",
editor = {Alum{\"a}e, Tanel and
Fishel, Mark},
booktitle = "Proceedings of the 24th Nordic Conference on Computational Linguistics (NoDaLiDa)",
month = may,
year = "2023",
address = "T{\'o}rshavn, Faroe Islands",
publisher = "University of Tartu Library",
url = "https://aclanthology.org/2023.nodalida-1.59",
pages = "601--609",
abstract = "We present Gamli, an ASR corpus for Icelandic oral histories, the first of its kind for this language, derived from the {\'I}sm{\'u}s ethnographic collection. Corpora for oral histories differ in various ways from corpora for general ASR, they contain spontaneous speech, multiple speakers per channel, noisy environments, the effects of historic recording equipment, and typically a large proportion of elderly speakers. Gamli contains 146 hours of aligned speech and transcripts, split into a training set and a test set. We describe our approach for creating the transcripts, through both OCR of previous transcripts and post-editing of ASR output. We also describe our approach for aligning, segmenting, and filtering the corpus and finally training a Kaldi ASR system, which achieves 22.4{\%} word error rate (WER) on the Gamli test set, a substantial improvement from 58.4{\%} word error rate from a baseline general ASR system for Icelandic.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="obrien-etal-2023-gamli">
<titleInfo>
<title>Gamli - Icelandic Oral History Corpus: Design, Collection and Evaluation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Luke</namePart>
<namePart type="family">O’Brien</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Finnur</namePart>
<namePart type="family">Ingimundarson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jón</namePart>
<namePart type="family">Gu\dhnasson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stein\thór</namePart>
<namePart type="family">Steingrímsson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 24th Nordic Conference on Computational Linguistics (NoDaLiDa)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Tanel</namePart>
<namePart type="family">Alumäe</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mark</namePart>
<namePart type="family">Fishel</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>University of Tartu Library</publisher>
<place>
<placeTerm type="text">Tórshavn, Faroe Islands</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We present Gamli, an ASR corpus for Icelandic oral histories, the first of its kind for this language, derived from the Ísmús ethnographic collection. Corpora for oral histories differ in various ways from corpora for general ASR, they contain spontaneous speech, multiple speakers per channel, noisy environments, the effects of historic recording equipment, and typically a large proportion of elderly speakers. Gamli contains 146 hours of aligned speech and transcripts, split into a training set and a test set. We describe our approach for creating the transcripts, through both OCR of previous transcripts and post-editing of ASR output. We also describe our approach for aligning, segmenting, and filtering the corpus and finally training a Kaldi ASR system, which achieves 22.4% word error rate (WER) on the Gamli test set, a substantial improvement from 58.4% word error rate from a baseline general ASR system for Icelandic.</abstract>
<identifier type="citekey">obrien-etal-2023-gamli</identifier>
<location>
<url>https://aclanthology.org/2023.nodalida-1.59</url>
</location>
<part>
<date>2023-05</date>
<extent unit="page">
<start>601</start>
<end>609</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Gamli - Icelandic Oral History Corpus: Design, Collection and Evaluation
%A O’Brien, Luke
%A Ingimundarson, Finnur
%A Gu\dhnasson, Jón
%A Steingrímsson, Stein\thór
%Y Alumäe, Tanel
%Y Fishel, Mark
%S Proceedings of the 24th Nordic Conference on Computational Linguistics (NoDaLiDa)
%D 2023
%8 May
%I University of Tartu Library
%C Tórshavn, Faroe Islands
%F obrien-etal-2023-gamli
%X We present Gamli, an ASR corpus for Icelandic oral histories, the first of its kind for this language, derived from the Ísmús ethnographic collection. Corpora for oral histories differ in various ways from corpora for general ASR, they contain spontaneous speech, multiple speakers per channel, noisy environments, the effects of historic recording equipment, and typically a large proportion of elderly speakers. Gamli contains 146 hours of aligned speech and transcripts, split into a training set and a test set. We describe our approach for creating the transcripts, through both OCR of previous transcripts and post-editing of ASR output. We also describe our approach for aligning, segmenting, and filtering the corpus and finally training a Kaldi ASR system, which achieves 22.4% word error rate (WER) on the Gamli test set, a substantial improvement from 58.4% word error rate from a baseline general ASR system for Icelandic.
%U https://aclanthology.org/2023.nodalida-1.59
%P 601-609
Markdown (Informal)
[Gamli - Icelandic Oral History Corpus: Design, Collection and Evaluation](https://aclanthology.org/2023.nodalida-1.59) (O’Brien et al., NoDaLiDa 2023)
ACL