@inproceedings{adouane-etal-2018-comparison,
title = "A Comparison of Character Neural Language Model and Bootstrapping for Language Identification in Multilingual Noisy Texts",
author = "Adouane, Wafia and
Dobnik, Simon and
Bernardy, Jean-Philippe and
Semmar, Nasredine",
editor = {Faruqui, Manaal and
Sch{\"u}tze, Hinrich and
Trancoso, Isabel and
Tsvetkov, Yulia and
Yaghoobzadeh, Yadollah},
booktitle = "Proceedings of the Second Workshop on Subword/Character {LE}vel Models",
month = jun,
year = "2018",
address = "New Orleans",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/W18-1203",
doi = "10.18653/v1/W18-1203",
pages = "22--31",
abstract = "This paper seeks to examine the effect of including background knowledge in the form of character pre-trained neural language model (LM), and data bootstrapping to overcome the problem of unbalanced limited resources. As a test, we explore the task of language identification in mixed-language short non-edited texts with an under-resourced language, namely the case of Algerian Arabic for which both labelled and unlabelled data are limited. We compare the performance of two traditional machine learning methods and a deep neural networks (DNNs) model. The results show that overall DNNs perform better on labelled data for the majority categories and struggle with the minority ones. While the effect of the untokenised and unlabelled data encoded as LM differs for each category, bootstrapping, however, improves the performance of all systems and all categories. These methods are language independent and could be generalised to other under-resourced languages for which a small labelled data and a larger unlabelled data are available.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="adouane-etal-2018-comparison">
<titleInfo>
<title>A Comparison of Character Neural Language Model and Bootstrapping for Language Identification in Multilingual Noisy Texts</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wafia</namePart>
<namePart type="family">Adouane</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Simon</namePart>
<namePart type="family">Dobnik</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jean-Philippe</namePart>
<namePart type="family">Bernardy</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nasredine</namePart>
<namePart type="family">Semmar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2018-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Second Workshop on Subword/Character LEvel Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Manaal</namePart>
<namePart type="family">Faruqui</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hinrich</namePart>
<namePart type="family">Schütze</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Isabel</namePart>
<namePart type="family">Trancoso</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yulia</namePart>
<namePart type="family">Tsvetkov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yadollah</namePart>
<namePart type="family">Yaghoobzadeh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">New Orleans</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>This paper examines the effect of including background knowledge in the form of a pre-trained character-level neural language model (LM), and of data bootstrapping, to overcome the problem of unbalanced limited resources. As a test, we explore the task of language identification in mixed-language, short, non-edited texts with an under-resourced language, namely Algerian Arabic, for which both labelled and unlabelled data are limited. We compare the performance of two traditional machine learning methods and a deep neural network (DNN) model. The results show that, overall, DNNs perform better on labelled data for the majority categories but struggle with the minority ones. While the effect of the untokenised and unlabelled data encoded as an LM differs for each category, bootstrapping improves the performance of all systems and all categories. These methods are language independent and could be generalised to other under-resourced languages for which a small amount of labelled data and a larger amount of unlabelled data are available.</abstract>
<identifier type="citekey">adouane-etal-2018-comparison</identifier>
<identifier type="doi">10.18653/v1/W18-1203</identifier>
<location>
<url>https://aclanthology.org/W18-1203</url>
</location>
<part>
<date>2018-06</date>
<extent unit="page">
<start>22</start>
<end>31</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T A Comparison of Character Neural Language Model and Bootstrapping for Language Identification in Multilingual Noisy Texts
%A Adouane, Wafia
%A Dobnik, Simon
%A Bernardy, Jean-Philippe
%A Semmar, Nasredine
%Y Faruqui, Manaal
%Y Schütze, Hinrich
%Y Trancoso, Isabel
%Y Tsvetkov, Yulia
%Y Yaghoobzadeh, Yadollah
%S Proceedings of the Second Workshop on Subword/Character LEvel Models
%D 2018
%8 June
%I Association for Computational Linguistics
%C New Orleans
%F adouane-etal-2018-comparison
%X This paper examines the effect of including background knowledge in the form of a pre-trained character-level neural language model (LM), and of data bootstrapping, to overcome the problem of unbalanced limited resources. As a test, we explore the task of language identification in mixed-language, short, non-edited texts with an under-resourced language, namely Algerian Arabic, for which both labelled and unlabelled data are limited. We compare the performance of two traditional machine learning methods and a deep neural network (DNN) model. The results show that, overall, DNNs perform better on labelled data for the majority categories but struggle with the minority ones. While the effect of the untokenised and unlabelled data encoded as an LM differs for each category, bootstrapping improves the performance of all systems and all categories. These methods are language independent and could be generalised to other under-resourced languages for which a small amount of labelled data and a larger amount of unlabelled data are available.
%R 10.18653/v1/W18-1203
%U https://aclanthology.org/W18-1203
%U https://doi.org/10.18653/v1/W18-1203
%P 22-31
Markdown (Informal)
[A Comparison of Character Neural Language Model and Bootstrapping for Language Identification in Multilingual Noisy Texts](https://aclanthology.org/W18-1203) (Adouane et al., SCLeM 2018)
ACL
Wafia Adouane, Simon Dobnik, Jean-Philippe Bernardy, and Nasredine Semmar. 2018. A Comparison of Character Neural Language Model and Bootstrapping for Language Identification in Multilingual Noisy Texts. In Proceedings of the Second Workshop on Subword/Character LEvel Models, pages 22–31, New Orleans. Association for Computational Linguistics.