@inproceedings{han-bel-2016-towards,
title = "Towards producing bilingual lexica from monolingual corpora",
author = "Han, Jingyi and
Bel, N{\'u}ria",
editor = "Calzolari, Nicoletta and
Choukri, Khalid and
Declerck, Thierry and
Goggi, Sara and
Grobelnik, Marko and
Maegaard, Bente and
Mariani, Joseph and
Mazo, Helene and
Moreno, Asuncion and
Odijk, Jan and
Piperidis, Stelios",
booktitle = "Proceedings of the Tenth International Conference on Language Resources and Evaluation ({LREC}'16)",
month = may,
year = "2016",
address = "Portoro{\v{z}}, Slovenia",
publisher = "European Language Resources Association (ELRA)",
url = "https://aclanthology.org/L16-1353",
pages = "2222--2227",
abstract = "Bilingual lexica are the basis for many cross-lingual natural language processing tasks. Recent works have shown success in learning bilingual dictionary by taking advantages of comparable corpora and a diverse set of signals derived from monolingual corpora. In the present work, we describe an approach to automatically learn bilingual lexica by training a supervised classifier using word embedding-based vectors of only a few hundred translation equivalent word pairs. The word embedding representations of translation pairs were obtained from source and target monolingual corpora, which are not necessarily related. Our classifier is able to predict whether a new word pair is under a translation relation or not. We tested it on two quite distinct language pairs Chinese-Spanish and English-Spanish. The classifiers achieved more than 0.90 precision and recall for both language pairs in different evaluation scenarios. These results show a high potential for this method to be used in bilingual lexica production for language pairs with reduced amount of parallel or comparable corpora, in particular for phrase table expansion in Statistical Machine Translation systems.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="han-bel-2016-towards">
<titleInfo>
<title>Towards producing bilingual lexica from monolingual corpora</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jingyi</namePart>
<namePart type="family">Han</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Núria</namePart>
<namePart type="family">Bel</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2016-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC’16)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Khalid</namePart>
<namePart type="family">Choukri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Thierry</namePart>
<namePart type="family">Declerck</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sara</namePart>
<namePart type="family">Goggi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marko</namePart>
<namePart type="family">Grobelnik</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bente</namePart>
<namePart type="family">Maegaard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joseph</namePart>
<namePart type="family">Mariani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Helene</namePart>
<namePart type="family">Mazo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Asuncion</namePart>
<namePart type="family">Moreno</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jan</namePart>
<namePart type="family">Odijk</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stelios</namePart>
<namePart type="family">Piperidis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>European Language Resources Association (ELRA)</publisher>
<place>
<placeTerm type="text">Portorož, Slovenia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Bilingual lexica are the basis for many cross-lingual natural language processing tasks. Recent works have shown success in learning bilingual dictionary by taking advantages of comparable corpora and a diverse set of signals derived from monolingual corpora. In the present work, we describe an approach to automatically learn bilingual lexica by training a supervised classifier using word embedding-based vectors of only a few hundred translation equivalent word pairs. The word embedding representations of translation pairs were obtained from source and target monolingual corpora, which are not necessarily related. Our classifier is able to predict whether a new word pair is under a translation relation or not. We tested it on two quite distinct language pairs Chinese-Spanish and English-Spanish. The classifiers achieved more than 0.90 precision and recall for both language pairs in different evaluation scenarios. These results show a high potential for this method to be used in bilingual lexica production for language pairs with reduced amount of parallel or comparable corpora, in particular for phrase table expansion in Statistical Machine Translation systems.</abstract>
<identifier type="citekey">han-bel-2016-towards</identifier>
<location>
<url>https://aclanthology.org/L16-1353</url>
</location>
<part>
<date>2016-05</date>
<extent unit="page">
<start>2222</start>
<end>2227</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Towards producing bilingual lexica from monolingual corpora
%A Han, Jingyi
%A Bel, Núria
%Y Calzolari, Nicoletta
%Y Choukri, Khalid
%Y Declerck, Thierry
%Y Goggi, Sara
%Y Grobelnik, Marko
%Y Maegaard, Bente
%Y Mariani, Joseph
%Y Mazo, Helene
%Y Moreno, Asuncion
%Y Odijk, Jan
%Y Piperidis, Stelios
%S Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC’16)
%D 2016
%8 May
%I European Language Resources Association (ELRA)
%C Portorož, Slovenia
%F han-bel-2016-towards
%X Bilingual lexica are the basis for many cross-lingual natural language processing tasks. Recent works have shown success in learning bilingual dictionary by taking advantages of comparable corpora and a diverse set of signals derived from monolingual corpora. In the present work, we describe an approach to automatically learn bilingual lexica by training a supervised classifier using word embedding-based vectors of only a few hundred translation equivalent word pairs. The word embedding representations of translation pairs were obtained from source and target monolingual corpora, which are not necessarily related. Our classifier is able to predict whether a new word pair is under a translation relation or not. We tested it on two quite distinct language pairs Chinese-Spanish and English-Spanish. The classifiers achieved more than 0.90 precision and recall for both language pairs in different evaluation scenarios. These results show a high potential for this method to be used in bilingual lexica production for language pairs with reduced amount of parallel or comparable corpora, in particular for phrase table expansion in Statistical Machine Translation systems.
%U https://aclanthology.org/L16-1353
%P 2222-2227
Markdown (Informal)
[Towards producing bilingual lexica from monolingual corpora](https://aclanthology.org/L16-1353) (Han & Bel, LREC 2016)
ACL