@inproceedings{camacho-collados-pilehvar-2018-role,
title = "On the Role of Text Preprocessing in Neural Network Architectures: An Evaluation Study on Text Categorization and Sentiment Analysis",
author = "Camacho-Collados, Jose and
Pilehvar, Mohammad Taher",
editor = "Linzen, Tal and
Chrupa{\l}a, Grzegorz and
Alishahi, Afra",
booktitle = "Proceedings of the 2018 {EMNLP} Workshop {B}lackbox{NLP}: Analyzing and Interpreting Neural Networks for {NLP}",
month = nov,
year = "2018",
address = "Brussels, Belgium",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/W18-5406",
doi = "10.18653/v1/W18-5406",
pages = "40--46",
abstract = "Text preprocessing is often the first step in the pipeline of a Natural Language Processing (NLP) system, with potential impact in its final performance. Despite its importance, text preprocessing has not received much attention in the deep learning literature. In this paper we investigate the impact of simple text preprocessing decisions (particularly tokenizing, lemmatizing, lowercasing and multiword grouping) on the performance of a standard neural text classifier. We perform an extensive evaluation on standard benchmarks from text categorization and sentiment analysis. While our experiments show that a simple tokenization of input text is generally adequate, they also highlight significant degrees of variability across preprocessing techniques. This reveals the importance of paying attention to this usually-overlooked step in the pipeline, particularly when comparing different models. Finally, our evaluation provides insights into the best preprocessing practices for training word embeddings.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="camacho-collados-pilehvar-2018-role">
<titleInfo>
<title>On the Role of Text Preprocessing in Neural Network Architectures: An Evaluation Study on Text Categorization and Sentiment Analysis</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jose</namePart>
<namePart type="family">Camacho-Collados</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohammad</namePart>
<namePart type="given">Taher</namePart>
<namePart type="family">Pilehvar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2018-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2018 EMNLP Workshop BlackboxNLP: Analyzing and Interpreting Neural Networks for NLP</title>
</titleInfo>
<name type="personal">
<namePart type="given">Tal</namePart>
<namePart type="family">Linzen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Grzegorz</namePart>
<namePart type="family">Chrupała</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Afra</namePart>
<namePart type="family">Alishahi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Brussels, Belgium</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Text preprocessing is often the first step in the pipeline of a Natural Language Processing (NLP) system, with potential impact in its final performance. Despite its importance, text preprocessing has not received much attention in the deep learning literature. In this paper we investigate the impact of simple text preprocessing decisions (particularly tokenizing, lemmatizing, lowercasing and multiword grouping) on the performance of a standard neural text classifier. We perform an extensive evaluation on standard benchmarks from text categorization and sentiment analysis. While our experiments show that a simple tokenization of input text is generally adequate, they also highlight significant degrees of variability across preprocessing techniques. This reveals the importance of paying attention to this usually-overlooked step in the pipeline, particularly when comparing different models. Finally, our evaluation provides insights into the best preprocessing practices for training word embeddings.</abstract>
<identifier type="citekey">camacho-collados-pilehvar-2018-role</identifier>
<identifier type="doi">10.18653/v1/W18-5406</identifier>
<location>
<url>https://aclanthology.org/W18-5406</url>
</location>
<part>
<date>2018-11</date>
<extent unit="page">
<start>40</start>
<end>46</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T On the Role of Text Preprocessing in Neural Network Architectures: An Evaluation Study on Text Categorization and Sentiment Analysis
%A Camacho-Collados, Jose
%A Pilehvar, Mohammad Taher
%Y Linzen, Tal
%Y Chrupała, Grzegorz
%Y Alishahi, Afra
%S Proceedings of the 2018 EMNLP Workshop BlackboxNLP: Analyzing and Interpreting Neural Networks for NLP
%D 2018
%8 November
%I Association for Computational Linguistics
%C Brussels, Belgium
%F camacho-collados-pilehvar-2018-role
%X Text preprocessing is often the first step in the pipeline of a Natural Language Processing (NLP) system, with potential impact in its final performance. Despite its importance, text preprocessing has not received much attention in the deep learning literature. In this paper we investigate the impact of simple text preprocessing decisions (particularly tokenizing, lemmatizing, lowercasing and multiword grouping) on the performance of a standard neural text classifier. We perform an extensive evaluation on standard benchmarks from text categorization and sentiment analysis. While our experiments show that a simple tokenization of input text is generally adequate, they also highlight significant degrees of variability across preprocessing techniques. This reveals the importance of paying attention to this usually-overlooked step in the pipeline, particularly when comparing different models. Finally, our evaluation provides insights into the best preprocessing practices for training word embeddings.
%R 10.18653/v1/W18-5406
%U https://aclanthology.org/W18-5406
%U https://doi.org/10.18653/v1/W18-5406
%P 40-46
Markdown (Informal)
[On the Role of Text Preprocessing in Neural Network Architectures: An Evaluation Study on Text Categorization and Sentiment Analysis](https://aclanthology.org/W18-5406) (Camacho-Collados & Pilehvar, EMNLP 2018)
ACL