@article{choenni-shutova-2022-investigating,
title = "Investigating Language Relationships in Multilingual Sentence Encoders Through the Lens of Linguistic Typology",
author = "Choenni, Rochelle and
Shutova, Ekaterina",
journal = "Computational Linguistics",
volume = "48",
number = "3",
month = sep,
year = "2022",
address = "Cambridge, MA",
publisher = "MIT Press",
url = "https://aclanthology.org/2022.cl-3.5",
doi = "10.1162/coli_a_00444",
pages = "635--672",
abstract = "Multilingual sentence encoders have seen much success in cross-lingual model transfer for downstream NLP tasks. The success of this transfer is, however, dependent on the model{'}s ability to encode the patterns of cross-lingual similarity and variation. Yet, we know relatively little about the properties of individual languages or the general patterns of linguistic variation that the models encode. In this article, we investigate these questions by leveraging knowledge from the field of linguistic typology, which studies and documents structural and semantic variation across languages. We propose methods for separating language-specific subspaces within state-of-the-art multilingual sentence encoders (LASER, M-BERT, XLM, and XLM-R) with respect to a range of typological properties pertaining to lexical, morphological, and syntactic structure. Moreover, we investigate how typological information about languages is distributed across all layers of the models. Our results show interesting differences in encoding linguistic variation associated with different pretraining strategies. In addition, we propose a simple method to study how shared typological properties of languages are encoded in two state-of-the-art multilingual models{---}M-BERT and XLM-R. The results provide insight into their information-sharing mechanisms and suggest that these linguistic properties are encoded jointly across typologically similar languages in these models.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="choenni-shutova-2022-investigating">
<titleInfo>
<title>Investigating Language Relationships in Multilingual Sentence Encoders Through the Lens of Linguistic Typology</title>
</titleInfo>
<name type="personal">
<namePart type="given">Rochelle</namePart>
<namePart type="family">Choenni</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Shutova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-09</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<genre authority="bibutilsgt">journal article</genre>
<relatedItem type="host">
<titleInfo>
<title>Computational Linguistics</title>
</titleInfo>
<originInfo>
<issuance>continuing</issuance>
<publisher>MIT Press</publisher>
<place>
<placeTerm type="text">Cambridge, MA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">periodical</genre>
<genre authority="bibutilsgt">academic journal</genre>
</relatedItem>
<abstract>Multilingual sentence encoders have seen much success in cross-lingual model transfer for downstream NLP tasks. The success of this transfer is, however, dependent on the model’s ability to encode the patterns of cross-lingual similarity and variation. Yet, we know relatively little about the properties of individual languages or the general patterns of linguistic variation that the models encode. In this article, we investigate these questions by leveraging knowledge from the field of linguistic typology, which studies and documents structural and semantic variation across languages. We propose methods for separating language-specific subspaces within state-of-the-art multilingual sentence encoders (LASER, M-BERT, XLM, and XLM-R) with respect to a range of typological properties pertaining to lexical, morphological, and syntactic structure. Moreover, we investigate how typological information about languages is distributed across all layers of the models. Our results show interesting differences in encoding linguistic variation associated with different pretraining strategies. In addition, we propose a simple method to study how shared typological properties of languages are encoded in two state-of-the-art multilingual models—M-BERT and XLM-R. The results provide insight into their information-sharing mechanisms and suggest that these linguistic properties are encoded jointly across typologically similar languages in these models.</abstract>
<identifier type="citekey">choenni-shutova-2022-investigating</identifier>
<identifier type="doi">10.1162/coli_a_00444</identifier>
<location>
<url>https://aclanthology.org/2022.cl-3.5</url>
</location>
<part>
<date>2022-09</date>
<detail type="volume"><number>48</number></detail>
<detail type="issue"><number>3</number></detail>
<extent unit="page">
<start>635</start>
<end>672</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Journal Article
%T Investigating Language Relationships in Multilingual Sentence Encoders Through the Lens of Linguistic Typology
%A Choenni, Rochelle
%A Shutova, Ekaterina
%J Computational Linguistics
%D 2022
%8 September
%V 48
%N 3
%I MIT Press
%C Cambridge, MA
%F choenni-shutova-2022-investigating
%X Multilingual sentence encoders have seen much success in cross-lingual model transfer for downstream NLP tasks. The success of this transfer is, however, dependent on the model’s ability to encode the patterns of cross-lingual similarity and variation. Yet, we know relatively little about the properties of individual languages or the general patterns of linguistic variation that the models encode. In this article, we investigate these questions by leveraging knowledge from the field of linguistic typology, which studies and documents structural and semantic variation across languages. We propose methods for separating language-specific subspaces within state-of-the-art multilingual sentence encoders (LASER, M-BERT, XLM, and XLM-R) with respect to a range of typological properties pertaining to lexical, morphological, and syntactic structure. Moreover, we investigate how typological information about languages is distributed across all layers of the models. Our results show interesting differences in encoding linguistic variation associated with different pretraining strategies. In addition, we propose a simple method to study how shared typological properties of languages are encoded in two state-of-the-art multilingual models—M-BERT and XLM-R. The results provide insight into their information-sharing mechanisms and suggest that these linguistic properties are encoded jointly across typologically similar languages in these models.
%R 10.1162/coli_a_00444
%U https://aclanthology.org/2022.cl-3.5
%U https://doi.org/10.1162/coli_a_00444
%P 635-672
Markdown (Informal)
[Investigating Language Relationships in Multilingual Sentence Encoders Through the Lens of Linguistic Typology](https://aclanthology.org/2022.cl-3.5) (Choenni & Shutova, CL 2022)
ACL
Rochelle Choenni and Ekaterina Shutova. 2022. Investigating Language Relationships in Multilingual Sentence Encoders Through the Lens of Linguistic Typology. Computational Linguistics, 48(3):635–672.
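
The abstract above describes probing multilingual encoders, layer by layer, for typological properties of languages. As a rough illustration of that general idea (not the authors' actual experimental setup or data), the sketch below trains a linear probe on mean-pooled M-BERT sentence vectors from one hidden layer to predict a binary word-order feature. The layer choice, example sentences, and labels are toy stand-ins; it assumes the `transformers`, `torch`, and `scikit-learn` packages.

```python
# Minimal layer-wise typological probing sketch -- an illustration of the
# general technique, NOT the paper's exact method or data.
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.linear_model import LogisticRegression

MODEL = "bert-base-multilingual-cased"  # M-BERT, one of the encoders studied
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModel.from_pretrained(MODEL, output_hidden_states=True)
model.eval()

def sentence_embedding(text: str, layer: int) -> torch.Tensor:
    """Mean-pool the token states of one hidden layer into a sentence vector."""
    inputs = tokenizer(text, return_tensors="pt", truncation=True)
    with torch.no_grad():
        hidden = model(**inputs).hidden_states[layer]  # shape (1, seq_len, dim)
    return hidden.mean(dim=1).squeeze(0)

# Toy data: sentences labeled with the dominant word order of their language.
sentences = [
    "The cat chased the mouse.",    # English, SVO
    "El gato persiguió al ratón.",  # Spanish, SVO
    "Neko ga nezumi o oikaketa.",   # Japanese (romanized), SOV
    "Kedi fareyi kovaladı.",        # Turkish, SOV
]
labels = [0, 0, 1, 1]  # 0 = SVO language, 1 = SOV language

# Fit a linear probe on one layer's representations; repeating this for every
# layer shows where in the network the property is most easily recovered.
X = torch.stack([sentence_embedding(s, layer=8) for s in sentences]).numpy()
probe = LogisticRegression(max_iter=1000).fit(X, labels)
print("Training accuracy at layer 8:", probe.score(X, labels))
```

A real experiment would use many more sentences and evaluate on held-out languages; with four training points the probe fits trivially, so the printed accuracy only demonstrates the plumbing.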