@inproceedings{sabty-etal-2020-contextual,
title = "Contextual Embeddings for {A}rabic-{E}nglish Code-Switched Data",
author = "Sabty, Caroline and
Islam, Mohamed and
Abdennadher, Slim",
editor = "Zitouni, Imed and
Abdul-Mageed, Muhammad and
Bouamor, Houda and
Bougares, Fethi and
El-Haj, Mahmoud and
Tomeh, Nadi and
Zaghouani, Wajdi",
booktitle = "Proceedings of the Fifth Arabic Natural Language Processing Workshop",
month = dec,
year = "2020",
address = "Barcelona, Spain (Online)",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2020.wanlp-1.20/",
pages = "215--225",
abstract = "Globalization has caused the rise of the code-switching phenomenon among multilingual societies. In Arab countries, code-switching between Arabic and English has become frequent, especially through social media platforms. Consequently, research in Natural Language Processing (NLP) systems increased to tackle such a phenomenon. One of the significant challenges of developing code-switched NLP systems is the lack of data itself. In this paper, we propose an open source trained bilingual contextual word embedding models of FLAIR, BERT, and ELECTRA. We also propose a novel contextual word embedding model called KERMIT, which can efficiently map Arabic and English words inside one vector space in terms of data usage. We applied intrinsic and extrinsic evaluation methods to compare the performance of the models. Our results show that FLAIR and FastText achieve the highest results in the sentiment analysis task. However, KERMIT is the best-achieving model on the intrinsic evaluation and named entity recognition. Also, it outperforms the other transformer-based models on question answering task."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="sabty-etal-2020-contextual">
<titleInfo>
<title>Contextual Embeddings for Arabic-English Code-Switched Data</title>
</titleInfo>
<name type="personal">
<namePart type="given">Caroline</namePart>
<namePart type="family">Sabty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohamed</namePart>
<namePart type="family">Islam</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Slim</namePart>
<namePart type="family">Abdennadher</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Fifth Arabic Natural Language Processing Workshop</title>
</titleInfo>
<name type="personal">
<namePart type="given">Imed</namePart>
<namePart type="family">Zitouni</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Muhammad</namePart>
<namePart type="family">Abdul-Mageed</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Houda</namePart>
<namePart type="family">Bouamor</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Fethi</namePart>
<namePart type="family">Bougares</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mahmoud</namePart>
<namePart type="family">El-Haj</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nadi</namePart>
<namePart type="family">Tomeh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wajdi</namePart>
<namePart type="family">Zaghouani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Barcelona, Spain (Online)</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Globalization has caused the rise of the code-switching phenomenon among multilingual societies. In Arab countries, code-switching between Arabic and English has become frequent, especially through social media platforms. Consequently, research in Natural Language Processing (NLP) systems increased to tackle such a phenomenon. One of the significant challenges of developing code-switched NLP systems is the lack of data itself. In this paper, we propose an open source trained bilingual contextual word embedding models of FLAIR, BERT, and ELECTRA. We also propose a novel contextual word embedding model called KERMIT, which can efficiently map Arabic and English words inside one vector space in terms of data usage. We applied intrinsic and extrinsic evaluation methods to compare the performance of the models. Our results show that FLAIR and FastText achieve the highest results in the sentiment analysis task. However, KERMIT is the best-achieving model on the intrinsic evaluation and named entity recognition. Also, it outperforms the other transformer-based models on question answering task.</abstract>
<identifier type="citekey">sabty-etal-2020-contextual</identifier>
<location>
<url>https://aclanthology.org/2020.wanlp-1.20/</url>
</location>
<part>
<date>2020-12</date>
<extent unit="page">
<start>215</start>
<end>225</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Contextual Embeddings for Arabic-English Code-Switched Data
%A Sabty, Caroline
%A Islam, Mohamed
%A Abdennadher, Slim
%Y Zitouni, Imed
%Y Abdul-Mageed, Muhammad
%Y Bouamor, Houda
%Y Bougares, Fethi
%Y El-Haj, Mahmoud
%Y Tomeh, Nadi
%Y Zaghouani, Wajdi
%S Proceedings of the Fifth Arabic Natural Language Processing Workshop
%D 2020
%8 December
%I Association for Computational Linguistics
%C Barcelona, Spain (Online)
%F sabty-etal-2020-contextual
%X Globalization has caused the rise of the code-switching phenomenon among multilingual societies. In Arab countries, code-switching between Arabic and English has become frequent, especially through social media platforms. Consequently, research in Natural Language Processing (NLP) systems increased to tackle such a phenomenon. One of the significant challenges of developing code-switched NLP systems is the lack of data itself. In this paper, we propose an open source trained bilingual contextual word embedding models of FLAIR, BERT, and ELECTRA. We also propose a novel contextual word embedding model called KERMIT, which can efficiently map Arabic and English words inside one vector space in terms of data usage. We applied intrinsic and extrinsic evaluation methods to compare the performance of the models. Our results show that FLAIR and FastText achieve the highest results in the sentiment analysis task. However, KERMIT is the best-achieving model on the intrinsic evaluation and named entity recognition. Also, it outperforms the other transformer-based models on question answering task.
%U https://aclanthology.org/2020.wanlp-1.20/
%P 215-225
Markdown (Informal)
[Contextual Embeddings for Arabic-English Code-Switched Data](https://aclanthology.org/2020.wanlp-1.20/) (Sabty et al., WANLP 2020)
ACL