% Gerz et al. (2018), journal article in TACL vol. 6, pp. 451--465.
% The accented author name uses a LaTeX escape so the entry works under both classic BibTeX and Biber.
@article{TACL1379,
	author   = {Gerz, Daniela and Vuli{\'c}, Ivan and Ponti, Edoardo Maria and Naradowsky, Jason and Reichart, Roi and Korhonen, Anna},
	title    = {Language Modeling for Morphologically Rich Languages: Character-Aware Modeling for Word-Level Prediction},
	journal  = {Transactions of the Association for Computational Linguistics},
	volume   = {6},
	pages    = {451--465},
	year     = {2018},
	issn     = {2307-387X},
	url      = {https://www.transacl.org/ojs/index.php/tacl/article/view/1379},
	abstract = {Neural architectures are prominent in the construction of language models (LMs). However, word-level prediction is typically agnostic of subword-level information (characters and character sequences) and operates over a closed vocabulary, consisting of a limited word set. Indeed, while subword-aware models boost performance across a variety of NLP tasks, previous work did not evaluate the ability of these models to assist next-word prediction in language modeling tasks. Such subword-level informed models should be particularly effective for morphologically-rich languages (MRLs) that exhibit high type-to-token ratios. In this work, we present a large-scale LM study on 50 typologically diverse languages covering a wide variety of morphological systems, and offer new LM benchmarks to the community, while considering subword-level information. The main technical contribution of our work is a novel method for injecting subword-level information into semantic word vectors, integrated into the neural language modeling training, to facilitate word-level prediction. We conduct experiments in the LM setting where the number of infrequent words is large, and demonstrate strong perplexity gains across our 50 languages, especially for morphologically-rich languages. Our code and data sets are publicly available.},
}
