@inproceedings{clouatre-etal-2022-detecting,
title = "Detecting Languages Unintelligible to Multilingual Models through Local Structure Probes",
author = "Clouatre, Louis and
Parthasarathi, Prasanna and
Zouaq, Amal and
Chandar, Sarath",
editor = "Goldberg, Yoav and
Kozareva, Zornitsa and
Zhang, Yue",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2022",
month = dec,
year = "2022",
address = "Abu Dhabi, United Arab Emirates",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.findings-emnlp.393",
doi = "10.18653/v1/2022.findings-emnlp.393",
pages = "5375--5396",
abstract = "Providing better language tools for low-resource and endangered languages is imperative for equitable growth.Recent progress with massively multilingual pretrained models has proven surprisingly effective at performing zero-shot transfer to a wide variety of languages.However, this transfer is not universal, with many languages not currently understood by multilingual approaches.It is estimated that only 72 languages possess a {``}small set of labeled datasets{''} on which we could test a model{'}s performance, the vast majority of languages not having the resources available to simply evaluate performances on.In this work, we attempt to clarify which languages do and do not currently benefit from such transfer.To that end, we develop a general approach that requires only unlabelled text to detect which languages are not well understood by a cross-lingual model.Our approach is derived from the hypothesis that if a model{'}s understanding is insensitive to perturbations to text in a language, it is likely to have a limited understanding of that language.We construct a cross-lingual sentence similarity task to evaluate our approach empirically on 350, primarily low-resource, languages.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="clouatre-etal-2022-detecting">
<titleInfo>
<title>Detecting Languages Unintelligible to Multilingual Models through Local Structure Probes</title>
</titleInfo>
<name type="personal">
<namePart type="given">Louis</namePart>
<namePart type="family">Clouatre</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Prasanna</namePart>
<namePart type="family">Parthasarathi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Amal</namePart>
<namePart type="family">Zouaq</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sarath</namePart>
<namePart type="family">Chandar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2022</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yoav</namePart>
<namePart type="family">Goldberg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zornitsa</namePart>
<namePart type="family">Kozareva</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yue</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, United Arab Emirates</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Providing better language tools for low-resource and endangered languages is imperative for equitable growth.Recent progress with massively multilingual pretrained models has proven surprisingly effective at performing zero-shot transfer to a wide variety of languages.However, this transfer is not universal, with many languages not currently understood by multilingual approaches.It is estimated that only 72 languages possess a “small set of labeled datasets” on which we could test a model’s performance, the vast majority of languages not having the resources available to simply evaluate performances on.In this work, we attempt to clarify which languages do and do not currently benefit from such transfer.To that end, we develop a general approach that requires only unlabelled text to detect which languages are not well understood by a cross-lingual model.Our approach is derived from the hypothesis that if a model’s understanding is insensitive to perturbations to text in a language, it is likely to have a limited understanding of that language.We construct a cross-lingual sentence similarity task to evaluate our approach empirically on 350, primarily low-resource, languages.</abstract>
<identifier type="citekey">clouatre-etal-2022-detecting</identifier>
<identifier type="doi">10.18653/v1/2022.findings-emnlp.393</identifier>
<location>
<url>https://aclanthology.org/2022.findings-emnlp.393</url>
</location>
<part>
<date>2022-12</date>
<extent unit="page">
<start>5375</start>
<end>5396</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Detecting Languages Unintelligible to Multilingual Models through Local Structure Probes
%A Clouatre, Louis
%A Parthasarathi, Prasanna
%A Zouaq, Amal
%A Chandar, Sarath
%Y Goldberg, Yoav
%Y Kozareva, Zornitsa
%Y Zhang, Yue
%S Findings of the Association for Computational Linguistics: EMNLP 2022
%D 2022
%8 December
%I Association for Computational Linguistics
%C Abu Dhabi, United Arab Emirates
%F clouatre-etal-2022-detecting
%X Providing better language tools for low-resource and endangered languages is imperative for equitable growth.Recent progress with massively multilingual pretrained models has proven surprisingly effective at performing zero-shot transfer to a wide variety of languages.However, this transfer is not universal, with many languages not currently understood by multilingual approaches.It is estimated that only 72 languages possess a “small set of labeled datasets” on which we could test a model’s performance, the vast majority of languages not having the resources available to simply evaluate performances on.In this work, we attempt to clarify which languages do and do not currently benefit from such transfer.To that end, we develop a general approach that requires only unlabelled text to detect which languages are not well understood by a cross-lingual model.Our approach is derived from the hypothesis that if a model’s understanding is insensitive to perturbations to text in a language, it is likely to have a limited understanding of that language.We construct a cross-lingual sentence similarity task to evaluate our approach empirically on 350, primarily low-resource, languages.
%R 10.18653/v1/2022.findings-emnlp.393
%U https://aclanthology.org/2022.findings-emnlp.393
%U https://doi.org/10.18653/v1/2022.findings-emnlp.393
%P 5375-5396
Markdown (Informal)
[Detecting Languages Unintelligible to Multilingual Models through Local Structure Probes](https://aclanthology.org/2022.findings-emnlp.393) (Clouatre et al., Findings 2022)
ACL