@article{sajjad-etal-2017-statistical,
    title     = {Statistical Models for Unsupervised, Semi-Supervised Supervised Transliteration Mining},
    author    = {Sajjad, Hassan and
                 Schmid, Helmut and
                 Fraser, Alexander and
                 Sch{\"u}tze, Hinrich},
    journal   = {Computational Linguistics},
    volume    = {43},
    number    = {2},
    month     = jun,
    year      = {2017},
    address   = {Cambridge, MA},
    publisher = {MIT Press},
    url       = {https://aclanthology.org/J17-2003/},
    doi       = {10.1162/COLI_a_00286},
    pages     = {349--375},
    abstract  = {We present a generative model that efficiently mines transliteration pairs in a consistent fashion in three different settings: unsupervised, semi-supervised, and supervised transliteration mining. The model interpolates two sub-models, one for the generation of transliteration pairs and one for the generation of non-transliteration pairs (i.e., noise). The model is trained on noisy unlabeled data using the EM algorithm. During training the transliteration sub-model learns to generate transliteration pairs and the fixed non-transliteration model generates the noise pairs. After training, the unlabeled data is disambiguated based on the posterior probabilities of the two sub-models. We evaluate our transliteration mining system on data from a transliteration mining shared task and on parallel corpora. For three out of four language pairs, our system outperforms all semi-supervised and supervised systems that participated in the NEWS 2010 shared task. On word pairs extracted from parallel corpora with fewer than 2{\%} transliteration pairs, our system achieves up to 86.7{\%} F-measure with 77.9{\%} precision and 97.8{\%} recall.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="sajjad-etal-2017-statistical">
    <titleInfo>
        <title>Statistical Models for Unsupervised, Semi-Supervised Supervised Transliteration Mining</title>
    </titleInfo>
    <name type="personal">
        <namePart type="given">Hassan</namePart>
        <namePart type="family">Sajjad</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Helmut</namePart>
        <namePart type="family">Schmid</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Alexander</namePart>
        <namePart type="family">Fraser</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Hinrich</namePart>
        <namePart type="family">Schütze</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <originInfo>
        <dateIssued>2017-06</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <genre authority="bibutilsgt">journal article</genre>
    <relatedItem type="host">
        <titleInfo>
            <title>Computational Linguistics</title>
        </titleInfo>
        <originInfo>
            <issuance>continuing</issuance>
            <publisher>MIT Press</publisher>
            <place>
                <placeTerm type="text">Cambridge, MA</placeTerm>
            </place>
        </originInfo>
        <genre authority="marcgt">periodical</genre>
        <genre authority="bibutilsgt">academic journal</genre>
    </relatedItem>
    <abstract>We present a generative model that efficiently mines transliteration pairs in a consistent fashion in three different settings: unsupervised, semi-supervised, and supervised transliteration mining. The model interpolates two sub-models, one for the generation of transliteration pairs and one for the generation of non-transliteration pairs (i.e., noise). The model is trained on noisy unlabeled data using the EM algorithm. During training the transliteration sub-model learns to generate transliteration pairs and the fixed non-transliteration model generates the noise pairs. After training, the unlabeled data is disambiguated based on the posterior probabilities of the two sub-models. We evaluate our transliteration mining system on data from a transliteration mining shared task and on parallel corpora. For three out of four language pairs, our system outperforms all semi-supervised and supervised systems that participated in the NEWS 2010 shared task. On word pairs extracted from parallel corpora with fewer than 2% transliteration pairs, our system achieves up to 86.7% F-measure with 77.9% precision and 97.8% recall.</abstract>
    <identifier type="citekey">sajjad-etal-2017-statistical</identifier>
    <identifier type="doi">10.1162/COLI_a_00286</identifier>
    <location>
        <url>https://aclanthology.org/J17-2003/</url>
    </location>
    <part>
        <date>2017-06</date>
        <detail type="volume"><number>43</number></detail>
        <detail type="issue"><number>2</number></detail>
        <extent unit="page">
            <start>349</start>
            <end>375</end>
        </extent>
    </part>
</mods>
</modsCollection>
%0 Journal Article
%T Statistical Models for Unsupervised, Semi-Supervised Supervised Transliteration Mining
%A Sajjad, Hassan
%A Schmid, Helmut
%A Fraser, Alexander
%A Schütze, Hinrich
%J Computational Linguistics
%D 2017
%8 June
%V 43
%N 2
%I MIT Press
%C Cambridge, MA
%F sajjad-etal-2017-statistical
%X We present a generative model that efficiently mines transliteration pairs in a consistent fashion in three different settings: unsupervised, semi-supervised, and supervised transliteration mining. The model interpolates two sub-models, one for the generation of transliteration pairs and one for the generation of non-transliteration pairs (i.e., noise). The model is trained on noisy unlabeled data using the EM algorithm. During training the transliteration sub-model learns to generate transliteration pairs and the fixed non-transliteration model generates the noise pairs. After training, the unlabeled data is disambiguated based on the posterior probabilities of the two sub-models. We evaluate our transliteration mining system on data from a transliteration mining shared task and on parallel corpora. For three out of four language pairs, our system outperforms all semi-supervised and supervised systems that participated in the NEWS 2010 shared task. On word pairs extracted from parallel corpora with fewer than 2% transliteration pairs, our system achieves up to 86.7% F-measure with 77.9% precision and 97.8% recall.
%R 10.1162/COLI_a_00286
%U https://aclanthology.org/J17-2003/
%U https://doi.org/10.1162/COLI_a_00286
%P 349-375
Markdown (Informal)
[Statistical Models for Unsupervised, Semi-Supervised Supervised Transliteration Mining](https://aclanthology.org/J17-2003/) (Sajjad et al., CL 2017)
ACL