% Conference paper from the IWSLT 2012 proceedings (ACL Anthology record).
% `address` holds the venue city; `month` carries the day range of the event.
@inproceedings{aransa-etal-2012-semi,
  title     = {Semi-supervised transliteration mining from parallel and comparable corpora},
  author    = {Aransa, Walid and
               Schwenk, Holger and
               Barrault, Loic},
  booktitle = {Proceedings of the 9th International Workshop on Spoken Language Translation: Papers},
  month     = dec # " 6--7",
  year      = {2012},
  address   = {Hong Kong},
  url       = {https://aclanthology.org/2012.iwslt-papers.6},
  pages     = {185--192},
  abstract  = {Transliteration is the process of writing a word (mainly proper noun) from one language in the alphabet of another language. This process requires mapping the pronunciation of the word from the source language to the closest possible pronunciation in the target language. In this paper we introduce a new semi-supervised transliteration mining method for parallel and comparable corpora. The method is mainly based on a new suggested Three Levels of Similarity (TLS) scores to extract the transliteration pairs. The first level calculates the similarity of all vowel letters and consonants letters. The second level calculates the similarity of long vowels and vowel letters at beginning and end position of the words and consonants letters. The third level calculates the similarity consonants letters only. We applied our method on Arabic-English parallel and comparable corpora. We evaluated the extracted transliteration pairs using a statistical based transliteration system. This system is built using letters instead of words as tokens. The transliteration system achieves an accuracy of 0.50 and a mean F-score 0.8958 when trained on transliteration pairs extracted from a parallel corpus. The accuracy is 0.30 and the mean F-score 0.84 when we used instead a comparable corpus to automatically extract the transliteration pairs. This shows that the proposed semi-supervised transliteration mining algorithm is effective and can be applied to other language pairs. We also evaluated two segmentation techniques and reported the impact on the transliteration performance.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 record for the conference paper with citekey aransa-etal-2012-semi.
     The host relatedItem describes the proceedings volume; its placeTerm is the venue city. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="aransa-etal-2012-semi">
    <titleInfo>
      <title>Semi-supervised transliteration mining from parallel and comparable corpora</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Walid</namePart>
      <namePart type="family">Aransa</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Holger</namePart>
      <namePart type="family">Schwenk</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Loic</namePart>
      <namePart type="family">Barrault</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <!-- Free-text date (no encoding attribute): year, month and day range of the event. -->
      <dateIssued>2012-dec 6-7</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 9th International Workshop on Spoken Language Translation: Papers</title>
      </titleInfo>
      <originInfo>
        <place>
          <placeTerm type="text">Hong Kong</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Transliteration is the process of writing a word (mainly proper noun) from one language in the alphabet of another language. This process requires mapping the pronunciation of the word from the source language to the closest possible pronunciation in the target language. In this paper we introduce a new semi-supervised transliteration mining method for parallel and comparable corpora. The method is mainly based on a new suggested Three Levels of Similarity (TLS) scores to extract the transliteration pairs. The first level calculates the similarity of all vowel letters and consonants letters. The second level calculates the similarity of long vowels and vowel letters at beginning and end position of the words and consonants letters. The third level calculates the similarity consonants letters only. We applied our method on Arabic-English parallel and comparable corpora. We evaluated the extracted transliteration pairs using a statistical based transliteration system. This system is built using letters instead of words as tokens. The transliteration system achieves an accuracy of 0.50 and a mean F-score 0.8958 when trained on transliteration pairs extracted from a parallel corpus. The accuracy is 0.30 and the mean F-score 0.84 when we used instead a comparable corpus to automatically extract the transliteration pairs. This shows that the proposed semi-supervised transliteration mining algorithm is effective and can be applied to other language pairs. We also evaluated two segmentation techniques and reported the impact on the transliteration performance.</abstract>
    <identifier type="citekey">aransa-etal-2012-semi</identifier>
    <location>
      <url>https://aclanthology.org/2012.iwslt-papers.6</url>
    </location>
    <part>
      <date>2012-dec 6-7</date>
      <extent unit="page">
        <start>185</start>
        <end>192</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Semi-supervised transliteration mining from parallel and comparable corpora
%A Aransa, Walid
%A Schwenk, Holger
%A Barrault, Loic
%S Proceedings of the 9th International Workshop on Spoken Language Translation: Papers
%D 2012
%8 dec 6-7
%C Hong Kong
%F aransa-etal-2012-semi
%X Transliteration is the process of writing a word (mainly proper noun) from one language in the alphabet of another language. This process requires mapping the pronunciation of the word from the source language to the closest possible pronunciation in the target language. In this paper we introduce a new semi-supervised transliteration mining method for parallel and comparable corpora. The method is mainly based on a new suggested Three Levels of Similarity (TLS) scores to extract the transliteration pairs. The first level calculates the similarity of all vowel letters and consonants letters. The second level calculates the similarity of long vowels and vowel letters at beginning and end position of the words and consonants letters. The third level calculates the similarity consonants letters only. We applied our method on Arabic-English parallel and comparable corpora. We evaluated the extracted transliteration pairs using a statistical based transliteration system. This system is built using letters instead of words as tokens. The transliteration system achieves an accuracy of 0.50 and a mean F-score 0.8958 when trained on transliteration pairs extracted from a parallel corpus. The accuracy is 0.30 and the mean F-score 0.84 when we used instead a comparable corpus to automatically extract the transliteration pairs. This shows that the proposed semi-supervised transliteration mining algorithm is effective and can be applied to other language pairs. We also evaluated two segmentation techniques and reported the impact on the transliteration performance.
%U https://aclanthology.org/2012.iwslt-papers.6
%P 185-192
Markdown (Informal)
[Semi-supervised transliteration mining from parallel and comparable corpora](https://aclanthology.org/2012.iwslt-papers.6) (Aransa et al., IWSLT 2012)
ACL