@inproceedings{moosa-etal-2023-transliteration,
title = "Does Transliteration Help Multilingual Language Modeling?",
author = "Moosa, Ibraheem Muhammad and
Akhter, Mahmud Elahi and
Habib, Ashfia Binte",
editor = "Vlachos, Andreas and
Augenstein, Isabelle",
booktitle = "Findings of the Association for Computational Linguistics: EACL 2023",
month = may,
year = "2023",
address = "Dubrovnik, Croatia",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.findings-eacl.50",
doi = "10.18653/v1/2023.findings-eacl.50",
pages = "670--685",
abstract = "Script diversity presents a challenge to Multilingual Language Models (MLLM) by reducing lexical overlap among closely related languages. Therefore, transliterating closely related languages that use different writing scripts to a common script may improve the downstream task performance of MLLMs. We empirically measure the effect of transliteration on MLLMs in this context. We specifically focus on the Indic languages, which have the highest script diversity in the world, and we evaluate our models on the IndicGLUE benchmark. We perform the Mann-Whitney U test to rigorously verify whether the effect of transliteration is significant or not. We find that transliteration benefits the low-resource languages without negatively affecting the comparatively high-resource languages. We also measure the cross-lingual representation similarity of the models using centered kernel alignment on parallel sentences from the FLORES-101 dataset. We find that for parallel sentences across different languages, the transliteration-based model learns sentence representations that are more similar.",
}
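
The transliteration step the abstract describes can be illustrated with a short sketch. The snippet below is a hypothetical, minimal example (not the authors' pipeline): it exploits the fact that the Brahmic Unicode blocks share a common ISCII-derived layout, so Bengali text can be mapped onto Devanagari by a fixed codepoint offset. Real transliteration needs per-script exception handling that this sketch omits.

```python
# Minimal sketch (not from the paper): map the Bengali Unicode block onto
# Devanagari by codepoint offset. Corresponding letters sit at the same
# offsets within their 128-codepoint blocks, a legacy of ISCII.
DEVANAGARI_BASE = 0x0900
BENGALI_BASE = 0x0980
BLOCK_SIZE = 0x80


def bengali_to_devanagari(text: str) -> str:
    """Shift Bengali codepoints into the Devanagari block; pass others through."""
    out = []
    for ch in text:
        cp = ord(ch)
        if BENGALI_BASE <= cp < BENGALI_BASE + BLOCK_SIZE:
            out.append(chr(cp - BENGALI_BASE + DEVANAGARI_BASE))
        else:
            out.append(ch)  # leave non-Bengali characters untouched
    return "".join(out)


print(bengali_to_devanagari("বাংলা"))  # -> "बांला", the word in Devanagari codepoints
```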
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="moosa-etal-2023-transliteration">
    <titleInfo>
      <title>Does Transliteration Help Multilingual Language Modeling?</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Ibraheem</namePart>
      <namePart type="given">Muhammad</namePart>
      <namePart type="family">Moosa</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Mahmud</namePart>
      <namePart type="given">Elahi</namePart>
      <namePart type="family">Akhter</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Ashfia</namePart>
      <namePart type="given">Binte</namePart>
      <namePart type="family">Habib</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2023-05</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: EACL 2023</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Andreas</namePart>
        <namePart type="family">Vlachos</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Isabelle</namePart>
        <namePart type="family">Augenstein</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Dubrovnik, Croatia</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Script diversity presents a challenge to Multilingual Language Models (MLLM) by reducing lexical overlap among closely related languages. Therefore, transliterating closely related languages that use different writing scripts to a common script may improve the downstream task performance of MLLMs. We empirically measure the effect of transliteration on MLLMs in this context. We specifically focus on the Indic languages, which have the highest script diversity in the world, and we evaluate our models on the IndicGLUE benchmark. We perform the Mann-Whitney U test to rigorously verify whether the effect of transliteration is significant or not. We find that transliteration benefits the low-resource languages without negatively affecting the comparatively high-resource languages. We also measure the cross-lingual representation similarity of the models using centered kernel alignment on parallel sentences from the FLORES-101 dataset. We find that for parallel sentences across different languages, the transliteration-based model learns sentence representations that are more similar.</abstract>
    <identifier type="citekey">moosa-etal-2023-transliteration</identifier>
    <identifier type="doi">10.18653/v1/2023.findings-eacl.50</identifier>
    <location>
      <url>https://aclanthology.org/2023.findings-eacl.50</url>
    </location>
    <part>
      <date>2023-05</date>
      <extent unit="page">
        <start>670</start>
        <end>685</end>
      </extent>
    </part>
  </mods>
</modsCollection>
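
The significance test named in the abstract is the standard Mann-Whitney U test. A minimal sketch with SciPy follows; the per-task scores are made up for illustration, not the paper's results.

```python
import numpy as np
from scipy.stats import mannwhitneyu

# Hypothetical downstream scores (illustrative only): a baseline model vs.
# a transliteration-based model on the same set of tasks.
baseline = np.array([71.2, 64.5, 58.9, 80.1, 62.3])
transliterated = np.array([73.0, 67.8, 61.2, 80.4, 65.9])

# Two-sided test of whether the two score distributions differ.
stat, p_value = mannwhitneyu(baseline, transliterated, alternative="two-sided")
print(f"U = {stat}, p = {p_value:.3f}")
```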
%0 Conference Proceedings
%T Does Transliteration Help Multilingual Language Modeling?
%A Moosa, Ibraheem Muhammad
%A Akhter, Mahmud Elahi
%A Habib, Ashfia Binte
%Y Vlachos, Andreas
%Y Augenstein, Isabelle
%S Findings of the Association for Computational Linguistics: EACL 2023
%D 2023
%8 May
%I Association for Computational Linguistics
%C Dubrovnik, Croatia
%F moosa-etal-2023-transliteration
%X Script diversity presents a challenge to Multilingual Language Models (MLLM) by reducing lexical overlap among closely related languages. Therefore, transliterating closely related languages that use different writing scripts to a common script may improve the downstream task performance of MLLMs. We empirically measure the effect of transliteration on MLLMs in this context. We specifically focus on the Indic languages, which have the highest script diversity in the world, and we evaluate our models on the IndicGLUE benchmark. We perform the Mann-Whitney U test to rigorously verify whether the effect of transliteration is significant or not. We find that transliteration benefits the low-resource languages without negatively affecting the comparatively high-resource languages. We also measure the cross-lingual representation similarity of the models using centered kernel alignment on parallel sentences from the FLORES-101 dataset. We find that for parallel sentences across different languages, the transliteration-based model learns sentence representations that are more similar.
%R 10.18653/v1/2023.findings-eacl.50
%U https://aclanthology.org/2023.findings-eacl.50
%U https://doi.org/10.18653/v1/2023.findings-eacl.50
%P 670-685
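
Centered kernel alignment, which the abstract uses to compare sentence representations across languages, has a simple linear variant (Kornblith et al., 2019). The sketch below assumes two row-aligned activation matrices, e.g. the same FLORES-101 sentences encoded in two languages; it is illustrative, not the paper's exact measurement code.

```python
import numpy as np


def linear_cka(X: np.ndarray, Y: np.ndarray) -> float:
    """Linear CKA between two representation matrices of shape (n, d).

    Rows are assumed aligned: row i of X and row i of Y encode the same
    sentence in two different languages.
    """
    X = X - X.mean(axis=0, keepdims=True)  # center each feature
    Y = Y - Y.mean(axis=0, keepdims=True)
    num = np.linalg.norm(Y.T @ X, "fro") ** 2
    den = np.linalg.norm(X.T @ X, "fro") * np.linalg.norm(Y.T @ Y, "fro")
    return float(num / den)


# Sanity check on random data: identical inputs give CKA = 1,
# unrelated inputs give a value near 0.
rng = np.random.default_rng(0)
X = rng.normal(size=(100, 64))
print(linear_cka(X, X))                            # ~1.0
print(linear_cka(X, rng.normal(size=(100, 64))))   # near 0
```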
Markdown (Informal)
[Does Transliteration Help Multilingual Language Modeling?](https://aclanthology.org/2023.findings-eacl.50) (Moosa et al., Findings 2023)
ACL
- Ibraheem Muhammad Moosa, Mahmud Elahi Akhter, and Ashfia Binte Habib. 2023. Does Transliteration Help Multilingual Language Modeling? In Findings of the Association for Computational Linguistics: EACL 2023, pages 670–685, Dubrovnik, Croatia. Association for Computational Linguistics.