% Lo and Simard, CoNLL 2019 paper (ACL Anthology K19-1020).
@inproceedings{lo-simard-2019-fully,
  author    = {Lo, Chi-kiu and Simard, Michel},
  editor    = {Bansal, Mohit and Villavicencio, Aline},
  title     = {Fully Unsupervised Crosslingual Semantic Textual Similarity Metric Based on {BERT} for Identifying Parallel Data},
  booktitle = {Proceedings of the 23rd Conference on Computational Natural Language Learning (CoNLL)},
  month     = nov,
  year      = {2019},
  address   = {Hong Kong, China},
  publisher = {Association for Computational Linguistics},
  pages     = {206--215},
  doi       = {10.18653/v1/K19-1020},
  url       = {https://aclanthology.org/K19-1020/},
  abstract  = {We present a fully unsupervised crosslingual semantic textual similarity (STS) metric, based on contextual embeddings extracted from BERT {--} Bidirectional Encoder Representations from Transformers (Devlin et al., 2019). The goal of crosslingual STS is to measure to what degree two segments of text in different languages express the same meaning. Not only is it a key task in crosslingual natural language understanding (XLU), it is also particularly useful for identifying parallel resources for training and evaluating downstream multilingual natural language processing (NLP) applications, such as machine translation. Most previous crosslingual STS methods relied heavily on existing parallel resources, thus leading to a circular dependency problem. With the advent of massively multilingual context representation models such as BERT, which are trained on the concatenation of non-parallel data from each language, we show that the deadlock around parallel resources can be broken. We perform intrinsic evaluations on crosslingual STS data sets and extrinsic evaluations on parallel corpus filtering and human translation equivalence assessment tasks. Our results show that the unsupervised crosslingual STS metric using BERT without fine-tuning achieves performance on par with supervised or weakly supervised approaches.}
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 record for citekey lo-simard-2019-fully. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="lo-simard-2019-fully">
    <titleInfo>
      <title>Fully Unsupervised Crosslingual Semantic Textual Similarity Metric Based on BERT for Identifying Parallel Data</title>
    </titleInfo>
    <!-- Authors of the paper. -->
    <name type="personal">
      <namePart type="given">Chi-kiu</namePart>
      <namePart type="family">Lo</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Michel</namePart>
      <namePart type="family">Simard</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2019-11</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, with its editors, publisher and place. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 23rd Conference on Computational Natural Language Learning (CoNLL)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Mohit</namePart>
        <namePart type="family">Bansal</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Aline</namePart>
        <namePart type="family">Villavicencio</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Hong Kong, China</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>We present a fully unsupervised crosslingual semantic textual similarity (STS) metric, based on contextual embeddings extracted from BERT – Bidirectional Encoder Representations from Transformers (Devlin et al., 2019). The goal of crosslingual STS is to measure to what degree two segments of text in different languages express the same meaning. Not only is it a key task in crosslingual natural language understanding (XLU), it is also particularly useful for identifying parallel resources for training and evaluating downstream multilingual natural language processing (NLP) applications, such as machine translation. Most previous crosslingual STS methods relied heavily on existing parallel resources, thus leading to a circular dependency problem. With the advent of massively multilingual context representation models such as BERT, which are trained on the concatenation of non-parallel data from each language, we show that the deadlock around parallel resources can be broken. We perform intrinsic evaluations on crosslingual STS data sets and extrinsic evaluations on parallel corpus filtering and human translation equivalence assessment tasks. Our results show that the unsupervised crosslingual STS metric using BERT without fine-tuning achieves performance on par with supervised or weakly supervised approaches.</abstract>
    <!-- Identifiers and canonical landing page. -->
    <identifier type="citekey">lo-simard-2019-fully</identifier>
    <identifier type="doi">10.18653/v1/K19-1020</identifier>
    <location>
      <url>https://aclanthology.org/K19-1020/</url>
    </location>
    <!-- Page range of the paper within the host volume. -->
    <part>
      <date>2019-11</date>
      <extent unit="page">
        <start>206</start>
        <end>215</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Fully Unsupervised Crosslingual Semantic Textual Similarity Metric Based on BERT for Identifying Parallel Data
%A Lo, Chi-kiu
%A Simard, Michel
%Y Bansal, Mohit
%Y Villavicencio, Aline
%S Proceedings of the 23rd Conference on Computational Natural Language Learning (CoNLL)
%D 2019
%8 November
%I Association for Computational Linguistics
%C Hong Kong, China
%F lo-simard-2019-fully
%X We present a fully unsupervised crosslingual semantic textual similarity (STS) metric, based on contextual embeddings extracted from BERT – Bidirectional Encoder Representations from Transformers (Devlin et al., 2019). The goal of crosslingual STS is to measure to what degree two segments of text in different languages express the same meaning. Not only is it a key task in crosslingual natural language understanding (XLU), it is also particularly useful for identifying parallel resources for training and evaluating downstream multilingual natural language processing (NLP) applications, such as machine translation. Most previous crosslingual STS methods relied heavily on existing parallel resources, thus leading to a circular dependency problem. With the advent of massively multilingual context representation models such as BERT, which are trained on the concatenation of non-parallel data from each language, we show that the deadlock around parallel resources can be broken. We perform intrinsic evaluations on crosslingual STS data sets and extrinsic evaluations on parallel corpus filtering and human translation equivalence assessment tasks. Our results show that the unsupervised crosslingual STS metric using BERT without fine-tuning achieves performance on par with supervised or weakly supervised approaches.
%R 10.18653/v1/K19-1020
%U https://aclanthology.org/K19-1020/
%U https://doi.org/10.18653/v1/K19-1020
%P 206-215
Markdown (Informal)
[Fully Unsupervised Crosslingual Semantic Textual Similarity Metric Based on BERT for Identifying Parallel Data](https://aclanthology.org/K19-1020/) (Lo & Simard, CoNLL 2019)
ACL