@inproceedings{marchisio-etal-2021-alignment,
title = "An Alignment-Based Approach to Semi-Supervised Bilingual Lexicon Induction with Small Parallel Corpora",
author = "Marchisio, Kelly and
Koehn, Philipp and
Xiong, Conghao",
editor = "Duh, Kevin and
Guzm{\'a}n, Francisco",
booktitle = "Proceedings of Machine Translation Summit XVIII: Research Track",
month = aug,
year = "2021",
address = "Virtual",
publisher = "Association for Machine Translation in the Americas",
url = "https://aclanthology.org/2021.mtsummit-research.24",
pages = "293--304",
abstract = "Aimed at generating a seed lexicon for use in downstream natural language tasks and unsupervised methods for bilingual lexicon induction have received much attention in the academic literature recently. While interesting and fully unsupervised settings are unrealistic; small amounts of bilingual data are usually available due to the existence of massively multilingual parallel corpora and or linguists can create small amounts of parallel data. In this work and we demonstrate an effective bootstrapping approach for semi-supervised bilingual lexicon induction that capitalizes upon the complementary strengths of two disparate methods for inducing bilingual lexicons. Whereas statistical methods are highly effective at inducing correct translation pairs for words frequently occurring in a parallel corpus and monolingual embedding spaces have the advantage of having been trained on large amounts of data and and therefore may induce accurate translations for words absent from the small corpus. By combining these relative strengths and our method achieves state-of-the-art results on 3 of 4 language pairs in the challenging VecMap test set using minimal amounts of parallel data and without the need for a translation dictionary. We release our implementation at www.blind-review.code.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="marchisio-etal-2021-alignment">
<titleInfo>
<title>An Alignment-Based Approach to Semi-Supervised Bilingual Lexicon Induction with Small Parallel Corpora</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kelly</namePart>
<namePart type="family">Marchisio</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Philipp</namePart>
<namePart type="family">Koehn</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Conghao</namePart>
<namePart type="family">Xiong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of Machine Translation Summit XVIII: Research Track</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kevin</namePart>
<namePart type="family">Duh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Francisco</namePart>
<namePart type="family">Guzmán</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Machine Translation in the Americas</publisher>
<place>
<placeTerm type="text">Virtual</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Aimed at generating a seed lexicon for use in downstream natural language tasks and unsupervised methods for bilingual lexicon induction have received much attention in the academic literature recently. While interesting and fully unsupervised settings are unrealistic; small amounts of bilingual data are usually available due to the existence of massively multilingual parallel corpora and or linguists can create small amounts of parallel data. In this work and we demonstrate an effective bootstrapping approach for semi-supervised bilingual lexicon induction that capitalizes upon the complementary strengths of two disparate methods for inducing bilingual lexicons. Whereas statistical methods are highly effective at inducing correct translation pairs for words frequently occurring in a parallel corpus and monolingual embedding spaces have the advantage of having been trained on large amounts of data and and therefore may induce accurate translations for words absent from the small corpus. By combining these relative strengths and our method achieves state-of-the-art results on 3 of 4 language pairs in the challenging VecMap test set using minimal amounts of parallel data and without the need for a translation dictionary. We release our implementation at www.blind-review.code.</abstract>
<identifier type="citekey">marchisio-etal-2021-alignment</identifier>
<location>
<url>https://aclanthology.org/2021.mtsummit-research.24</url>
</location>
<part>
<date>2021-08</date>
<extent unit="page">
<start>293</start>
<end>304</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T An Alignment-Based Approach to Semi-Supervised Bilingual Lexicon Induction with Small Parallel Corpora
%A Marchisio, Kelly
%A Koehn, Philipp
%A Xiong, Conghao
%Y Duh, Kevin
%Y Guzmán, Francisco
%S Proceedings of Machine Translation Summit XVIII: Research Track
%D 2021
%8 August
%I Association for Machine Translation in the Americas
%C Virtual
%F marchisio-etal-2021-alignment
%X Aimed at generating a seed lexicon for use in downstream natural language tasks and unsupervised methods for bilingual lexicon induction have received much attention in the academic literature recently. While interesting and fully unsupervised settings are unrealistic; small amounts of bilingual data are usually available due to the existence of massively multilingual parallel corpora and or linguists can create small amounts of parallel data. In this work and we demonstrate an effective bootstrapping approach for semi-supervised bilingual lexicon induction that capitalizes upon the complementary strengths of two disparate methods for inducing bilingual lexicons. Whereas statistical methods are highly effective at inducing correct translation pairs for words frequently occurring in a parallel corpus and monolingual embedding spaces have the advantage of having been trained on large amounts of data and and therefore may induce accurate translations for words absent from the small corpus. By combining these relative strengths and our method achieves state-of-the-art results on 3 of 4 language pairs in the challenging VecMap test set using minimal amounts of parallel data and without the need for a translation dictionary. We release our implementation at www.blind-review.code.
%U https://aclanthology.org/2021.mtsummit-research.24
%P 293-304
Markdown (Informal)
[An Alignment-Based Approach to Semi-Supervised Bilingual Lexicon Induction with Small Parallel Corpora](https://aclanthology.org/2021.mtsummit-research.24) (Marchisio et al., MTSummit 2021)
ACL