@inproceedings{santini-etal-2019-comparing,
title = "Comparing the Performance of Feature Representations for the Categorization of the Easy-to-Read Variety vs Standard Language",
author = {Santini, Marina and
Danielsson, Benjamin and
J{\"o}nsson, Arne},
editor = "Hartmann, Mareike and
Plank, Barbara",
booktitle = "Proceedings of the 22nd Nordic Conference on Computational Linguistics",
month = sep # "–" # oct,
year = "2019",
address = "Turku, Finland",
publisher = {Link{\"o}ping University Electronic Press},
url = "https://aclanthology.org/W19-6111/",
pages = "105--114",
abstract = "We explore the effectiveness of four feature representations {--} bag-of-words, word embeddings, principal components and autoencoders {--} for the binary categorization of the easy-to-read variety vs standard language. Standard language refers to the ordinary language variety used by a population as a whole or by a community, while the {\textquotedblleft}easy-to-read{\textquotedblright} variety is a simpler (or a simplified) version of the standard language. We test the efficiency of these feature representations on three corpora, which differ in size, class balance, unit of analysis, language and topic. We rely on supervised and unsupervised machine learning algorithms. Results show that bag-of-words is a robust and straightforward feature representation for this task and performs well in many experimental settings. Its performance is equivalent or equal to the performance achieved with principal components and autoencorders, whose preprocessing is however more time-consuming. Word embeddings are less accurate than the other feature representations for this classification task."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="santini-etal-2019-comparing">
<titleInfo>
<title>Comparing the Performance of Feature Representations for the Categorization of the Easy-to-Read Variety vs Standard Language</title>
</titleInfo>
<name type="personal">
<namePart type="given">Marina</namePart>
<namePart type="family">Santini</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Benjamin</namePart>
<namePart type="family">Danielsson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Arne</namePart>
<namePart type="family">Jönsson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2019-sep–oct</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 22nd Nordic Conference on Computational Linguistics</title>
</titleInfo>
<name type="personal">
<namePart type="given">Mareike</namePart>
<namePart type="family">Hartmann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Barbara</namePart>
<namePart type="family">Plank</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Linköping University Electronic Press</publisher>
<place>
<placeTerm type="text">Turku, Finland</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We explore the effectiveness of four feature representations – bag-of-words, word embeddings, principal components and autoencoders – for the binary categorization of the easy-to-read variety vs standard language. Standard language refers to the ordinary language variety used by a population as a whole or by a community, while the “easy-to-read” variety is a simpler (or a simplified) version of the standard language. We test the efficiency of these feature representations on three corpora, which differ in size, class balance, unit of analysis, language and topic. We rely on supervised and unsupervised machine learning algorithms. Results show that bag-of-words is a robust and straightforward feature representation for this task and performs well in many experimental settings. Its performance is equivalent or equal to the performance achieved with principal components and autoencorders, whose preprocessing is however more time-consuming. Word embeddings are less accurate than the other feature representations for this classification task.</abstract>
<identifier type="citekey">santini-etal-2019-comparing</identifier>
<location>
<url>https://aclanthology.org/W19-6111/</url>
</location>
<part>
<date>2019-sep–oct</date>
<extent unit="page">
<start>105</start>
<end>114</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Comparing the Performance of Feature Representations for the Categorization of the Easy-to-Read Variety vs Standard Language
%A Santini, Marina
%A Danielsson, Benjamin
%A Jönsson, Arne
%Y Hartmann, Mareike
%Y Plank, Barbara
%S Proceedings of the 22nd Nordic Conference on Computational Linguistics
%D 2019
%8 sep–oct
%I Linköping University Electronic Press
%C Turku, Finland
%F santini-etal-2019-comparing
%X We explore the effectiveness of four feature representations – bag-of-words, word embeddings, principal components and autoencoders – for the binary categorization of the easy-to-read variety vs standard language. Standard language refers to the ordinary language variety used by a population as a whole or by a community, while the “easy-to-read” variety is a simpler (or a simplified) version of the standard language. We test the efficiency of these feature representations on three corpora, which differ in size, class balance, unit of analysis, language and topic. We rely on supervised and unsupervised machine learning algorithms. Results show that bag-of-words is a robust and straightforward feature representation for this task and performs well in many experimental settings. Its performance is equivalent or equal to the performance achieved with principal components and autoencorders, whose preprocessing is however more time-consuming. Word embeddings are less accurate than the other feature representations for this classification task.
%U https://aclanthology.org/W19-6111/
%P 105-114
Markdown (Informal)
[Comparing the Performance of Feature Representations for the Categorization of the Easy-to-Read Variety vs Standard Language](https://aclanthology.org/W19-6111/) (Santini et al., NoDaLiDa 2019)
ACL