@inproceedings{elmahdy-etal-2014-automatic,
title = "Automatic Long Audio Alignment and Confidence Scoring for Conversational {A}rabic Speech",
author = "Elmahdy, Mohamed and
Hasegawa-Johnson, Mark and
Mustafawi, Eiman",
editor = "Calzolari, Nicoletta and
Choukri, Khalid and
Declerck, Thierry and
Loftsson, Hrafn and
Maegaard, Bente and
Mariani, Joseph and
Moreno, Asuncion and
Odijk, Jan and
Piperidis, Stelios",
booktitle = "Proceedings of the Ninth International Conference on Language Resources and Evaluation ({LREC}'14)",
month = may,
year = "2014",
address = "Reykjavik, Iceland",
publisher = "European Language Resources Association (ELRA)",
url = "http://www.lrec-conf.org/proceedings/lrec2014/pdf/434_Paper.pdf",
pages = "3062--3066",
abstract = "In this paper, a framework for long audio alignment for conversational Arabic speech is proposed. Accurate alignments help in many speech processing tasks such as audio indexing, speech recognizer acoustic model (AM) training, audio summarizing and retrieving, etc. We have collected more than 1,400 hours of conversational Arabic besides the corresponding human generated non-aligned transcriptions. Automatic audio segmentation is performed using a split and merge approach. A biased language model (LM) is trained using the corresponding text after a pre-processing stage. Because of the dominance of non-standard Arabic in conversational speech, a graphemic pronunciation model (PM) is utilized. The proposed alignment approach is performed in two passes. Firstly, a generic standard Arabic AM is used along with the biased LM and the graphemic PM in a fast speech recognition pass. In a second pass, a more restricted LM is generated for each audio segment, and unsupervised acoustic model adaptation is applied. The recognizer output is aligned with the processed transcriptions using Levenshtein algorithm. The proposed approach resulted in an initial alignment accuracy of 97.8-99.0{\%} depending on the amount of disfluencies. A confidence scoring metric is proposed to accept/reject aligner output. Using confidence scores, it was possible to reject the majority of mis-aligned segments resulting in alignment accuracy of 99.0-99.8{\%} depending on the speech domain and the amount of disfluencies.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="elmahdy-etal-2014-automatic">
<titleInfo>
<title>Automatic Long Audio Alignment and Confidence Scoring for Conversational Arabic Speech</title>
</titleInfo>
<name type="personal">
<namePart type="given">Mohamed</namePart>
<namePart type="family">Elmahdy</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mark</namePart>
<namePart type="family">Hasegawa-Johnson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Eiman</namePart>
<namePart type="family">Mustafawi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2014-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Ninth International Conference on Language Resources and Evaluation (LREC’14)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Khalid</namePart>
<namePart type="family">Choukri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Thierry</namePart>
<namePart type="family">Declerck</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hrafn</namePart>
<namePart type="family">Loftsson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bente</namePart>
<namePart type="family">Maegaard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joseph</namePart>
<namePart type="family">Mariani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Asuncion</namePart>
<namePart type="family">Moreno</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jan</namePart>
<namePart type="family">Odijk</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stelios</namePart>
<namePart type="family">Piperidis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>European Language Resources Association (ELRA)</publisher>
<place>
<placeTerm type="text">Reykjavik, Iceland</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>In this paper, a framework for long audio alignment for conversational Arabic speech is proposed. Accurate alignments help in many speech processing tasks such as audio indexing, speech recognizer acoustic model (AM) training, audio summarizing and retrieving, etc. We have collected more than 1,400 hours of conversational Arabic besides the corresponding human generated non-aligned transcriptions. Automatic audio segmentation is performed using a split and merge approach. A biased language model (LM) is trained using the corresponding text after a pre-processing stage. Because of the dominance of non-standard Arabic in conversational speech, a graphemic pronunciation model (PM) is utilized. The proposed alignment approach is performed in two passes. Firstly, a generic standard Arabic AM is used along with the biased LM and the graphemic PM in a fast speech recognition pass. In a second pass, a more restricted LM is generated for each audio segment, and unsupervised acoustic model adaptation is applied. The recognizer output is aligned with the processed transcriptions using Levenshtein algorithm. The proposed approach resulted in an initial alignment accuracy of 97.8-99.0% depending on the amount of disfluencies. A confidence scoring metric is proposed to accept/reject aligner output. Using confidence scores, it was possible to reject the majority of mis-aligned segments resulting in alignment accuracy of 99.0-99.8% depending on the speech domain and the amount of disfluencies.</abstract>
<identifier type="citekey">elmahdy-etal-2014-automatic</identifier>
<location>
<url>http://www.lrec-conf.org/proceedings/lrec2014/pdf/434_Paper.pdf</url>
</location>
<part>
<date>2014-05</date>
<extent unit="page">
<start>3062</start>
<end>3066</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Automatic Long Audio Alignment and Confidence Scoring for Conversational Arabic Speech
%A Elmahdy, Mohamed
%A Hasegawa-Johnson, Mark
%A Mustafawi, Eiman
%Y Calzolari, Nicoletta
%Y Choukri, Khalid
%Y Declerck, Thierry
%Y Loftsson, Hrafn
%Y Maegaard, Bente
%Y Mariani, Joseph
%Y Moreno, Asuncion
%Y Odijk, Jan
%Y Piperidis, Stelios
%S Proceedings of the Ninth International Conference on Language Resources and Evaluation (LREC’14)
%D 2014
%8 May
%I European Language Resources Association (ELRA)
%C Reykjavik, Iceland
%F elmahdy-etal-2014-automatic
%X In this paper, a framework for long audio alignment for conversational Arabic speech is proposed. Accurate alignments help in many speech processing tasks such as audio indexing, speech recognizer acoustic model (AM) training, audio summarizing and retrieving, etc. We have collected more than 1,400 hours of conversational Arabic besides the corresponding human generated non-aligned transcriptions. Automatic audio segmentation is performed using a split and merge approach. A biased language model (LM) is trained using the corresponding text after a pre-processing stage. Because of the dominance of non-standard Arabic in conversational speech, a graphemic pronunciation model (PM) is utilized. The proposed alignment approach is performed in two passes. Firstly, a generic standard Arabic AM is used along with the biased LM and the graphemic PM in a fast speech recognition pass. In a second pass, a more restricted LM is generated for each audio segment, and unsupervised acoustic model adaptation is applied. The recognizer output is aligned with the processed transcriptions using Levenshtein algorithm. The proposed approach resulted in an initial alignment accuracy of 97.8-99.0% depending on the amount of disfluencies. A confidence scoring metric is proposed to accept/reject aligner output. Using confidence scores, it was possible to reject the majority of mis-aligned segments resulting in alignment accuracy of 99.0-99.8% depending on the speech domain and the amount of disfluencies.
%U http://www.lrec-conf.org/proceedings/lrec2014/pdf/434_Paper.pdf
%P 3062-3066
Markdown (Informal)
[Automatic Long Audio Alignment and Confidence Scoring for Conversational Arabic Speech](http://www.lrec-conf.org/proceedings/lrec2014/pdf/434_Paper.pdf) (Elmahdy et al., LREC 2014)
ACL