@inproceedings{mutuvi-etal-2020-multilingual,
title = "Multilingual Epidemiological Text Classification: A Comparative Study",
author = {Mutuvi, Stephen and
Boros, Emanuela and
Doucet, Antoine and
Jatowt, Adam and
Lejeune, Ga{\"e}l and
Odeo, Moses},
editor = "Scott, Donia and
Bel, Nuria and
Zong, Chengqing",
booktitle = "Proceedings of the 28th International Conference on Computational Linguistics",
month = dec,
year = "2020",
address = "Barcelona, Spain (Online)",
publisher = "International Committee on Computational Linguistics",
url = "https://aclanthology.org/2020.coling-main.543",
doi = "10.18653/v1/2020.coling-main.543",
pages = "6172--6183",
abstract = "In this paper, we approach the multilingual text classification task in the context of the epidemiological field. Multilingual text classification models tend to perform differently across different languages (low- or high-resourced), more particularly when the dataset is highly imbalanced, which is the case for epidemiological datasets. We conduct a comparative study of different machine and deep learning text classification models using a dataset comprising news articles related to epidemic outbreaks from six languages, four low-resourced and two high-resourced, in order to analyze the influence of the nature of the language, the structure of the document, and the size of the data. Our findings indicate that the performance of the models based on fine-tuned language models exceeds by more than 50{\%} the chosen baseline models that include a specialized epidemiological news surveillance system and several machine learning models. Also, low-resource languages are highly influenced not only by the typology of the languages on which the models have been pre-trained or/and fine-tuned but also by their size. Furthermore, we discover that the beginning and the end of documents provide the most salient features for this task and, as expected, the performance of the models was proportionate to the training data size.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="mutuvi-etal-2020-multilingual">
<titleInfo>
<title>Multilingual Epidemiological Text Classification: A Comparative Study</title>
</titleInfo>
<name type="personal">
<namePart type="given">Stephen</namePart>
<namePart type="family">Mutuvi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Emanuela</namePart>
<namePart type="family">Boros</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Antoine</namePart>
<namePart type="family">Doucet</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Adam</namePart>
<namePart type="family">Jatowt</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gaël</namePart>
<namePart type="family">Lejeune</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Moses</namePart>
<namePart type="family">Odeo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 28th International Conference on Computational Linguistics</title>
</titleInfo>
<name type="personal">
<namePart type="given">Donia</namePart>
<namePart type="family">Scott</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nuria</namePart>
<namePart type="family">Bel</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chengqing</namePart>
<namePart type="family">Zong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>International Committee on Computational Linguistics</publisher>
<place>
<placeTerm type="text">Barcelona, Spain (Online)</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>In this paper, we approach the multilingual text classification task in the context of the epidemiological field. Multilingual text classification models tend to perform differently across different languages (low- or high-resourced), more particularly when the dataset is highly imbalanced, which is the case for epidemiological datasets. We conduct a comparative study of different machine and deep learning text classification models using a dataset comprising news articles related to epidemic outbreaks from six languages, four low-resourced and two high-resourced, in order to analyze the influence of the nature of the language, the structure of the document, and the size of the data. Our findings indicate that the performance of the models based on fine-tuned language models exceeds by more than 50% the chosen baseline models that include a specialized epidemiological news surveillance system and several machine learning models. Also, low-resource languages are highly influenced not only by the typology of the languages on which the models have been pre-trained or/and fine-tuned but also by their size. Furthermore, we discover that the beginning and the end of documents provide the most salient features for this task and, as expected, the performance of the models was proportionate to the training data size.</abstract>
<identifier type="citekey">mutuvi-etal-2020-multilingual</identifier>
<identifier type="doi">10.18653/v1/2020.coling-main.543</identifier>
<location>
<url>https://aclanthology.org/2020.coling-main.543</url>
</location>
<part>
<date>2020-12</date>
<extent unit="page">
<start>6172</start>
<end>6183</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Multilingual Epidemiological Text Classification: A Comparative Study
%A Mutuvi, Stephen
%A Boros, Emanuela
%A Doucet, Antoine
%A Jatowt, Adam
%A Lejeune, Gaël
%A Odeo, Moses
%Y Scott, Donia
%Y Bel, Nuria
%Y Zong, Chengqing
%S Proceedings of the 28th International Conference on Computational Linguistics
%D 2020
%8 December
%I International Committee on Computational Linguistics
%C Barcelona, Spain (Online)
%F mutuvi-etal-2020-multilingual
%X In this paper, we approach the multilingual text classification task in the context of the epidemiological field. Multilingual text classification models tend to perform differently across different languages (low- or high-resourced), more particularly when the dataset is highly imbalanced, which is the case for epidemiological datasets. We conduct a comparative study of different machine and deep learning text classification models using a dataset comprising news articles related to epidemic outbreaks from six languages, four low-resourced and two high-resourced, in order to analyze the influence of the nature of the language, the structure of the document, and the size of the data. Our findings indicate that the performance of the models based on fine-tuned language models exceeds by more than 50% the chosen baseline models that include a specialized epidemiological news surveillance system and several machine learning models. Also, low-resource languages are highly influenced not only by the typology of the languages on which the models have been pre-trained or/and fine-tuned but also by their size. Furthermore, we discover that the beginning and the end of documents provide the most salient features for this task and, as expected, the performance of the models was proportionate to the training data size.
%R 10.18653/v1/2020.coling-main.543
%U https://aclanthology.org/2020.coling-main.543
%U https://doi.org/10.18653/v1/2020.coling-main.543
%P 6172-6183
Markdown (Informal)
[Multilingual Epidemiological Text Classification: A Comparative Study](https://aclanthology.org/2020.coling-main.543) (Mutuvi et al., COLING 2020)
ACL
- Stephen Mutuvi, Emanuela Boros, Antoine Doucet, Adam Jatowt, Gaël Lejeune, and Moses Odeo. 2020. Multilingual Epidemiological Text Classification: A Comparative Study. In Proceedings of the 28th International Conference on Computational Linguistics, pages 6172–6183, Barcelona, Spain (Online). International Committee on Computational Linguistics.