@comment{WNUT 2016 workshop paper; ACL Anthology identifier W16-3916 (see url field).}
@InProceedings{costabertaglia-volpenunes:2016:WNUT,
  author    = {Costa Bertaglia, Thales Felipe and Volpe Nunes, Maria das Gra{\c{c}}as},
  title     = {Exploring Word Embeddings for Unsupervised Textual User-Generated Content Normalization},
  booktitle = {Proceedings of the 2nd Workshop on Noisy User-generated Text ({WNUT})},
  month     = dec,
  year      = {2016},
  address   = {Osaka, Japan},
  publisher = {The COLING 2016 Organizing Committee},
  pages     = {112--120},
  abstract  = {Text normalization techniques based on rules, lexicons or supervised
               training requiring large corpora are not scalable nor domain
               interchangeable, and this makes them unsuitable for normalizing
               user-generated content (UGC). Current tools available for Brazilian
               Portuguese make use of such techniques. In this work we propose a
               technique based on distributed representation of words (or word
               embeddings). It generates continuous numeric vectors of
               high-dimensionality to represent words. The vectors explicitly encode
               many linguistic regularities and patterns, as well as syntactic and
               semantic word relationships. Words that share semantic similarity are
               represented by similar vectors. Based on these features, we present a
               totally unsupervised, expandable and language and domain independent
               method for learning normalization lexicons from word embeddings. Our
               approach obtains high correction rate of orthographic errors and
               internet slang in product reviews, outperforming the current available
               tools for Brazilian Portuguese.},
  url       = {http://aclweb.org/anthology/W16-3916}
}

