% Workshop paper by Samuel Broscheit, ACL Anthology ID W18-3025 (see url field).
% month uses the predefined BibTeX macro (jul) so each style can format or
% localize it; the acronym in booktitle is brace-protected against recasing.
@InProceedings{broscheit:2018:W18-30,
  author    = {Broscheit, Samuel},
  title     = {Learning Distributional Token Representations from Visual Features},
  booktitle = {Proceedings of The Third Workshop on Representation Learning for {NLP}},
  month     = jul,
  year      = {2018},
  address   = {Melbourne, Australia},
  publisher = {Association for Computational Linguistics},
  pages     = {187--194},
  abstract  = {In this study, we compare token representations constructed from visual features (i.e., pixels) with standard lookup-based embeddings. Our goal is to gain insight about the challenges of encoding a text representation from low-level features, e.g. from characters or pixels. We focus on Chinese, which---as a logographic language---has properties that make a representation via visual features challenging and interesting. To train and evaluate different models for the token representation, we chose the task of character-based neural machine translation (NMT) from Chinese to English. We found that a token representation computed only from visual features can achieve competitive results to lookup embeddings. However, we also show different strengths and weaknesses in the models' performance in a part-of-speech tagging task and also a semantic similarity task. In summary, we show that it is possible to achieve a \textit{text representation} only from pixels. We hope that this is a useful stepping stone for future studies that exclusively rely on visual input, or aim at exploiting visual features of written language.},
  url       = {http://www.aclweb.org/anthology/W18-3025}
}

