% Journal article (TACL, vol. 4, 2016) on language modelling with compressed suffix trees.
% Only the journal landing-page URL is recorded here; no DOI field is present.
@article{TACL865,
	author   = {Shareghi, Ehsan and Petri, Matthias and Haffari, Gholamreza and Cohn, Trevor},
	title    = {Fast, Small and Exact: Infinite-order Language Modelling with Compressed Suffix Trees},
	journal  = {Transactions of the Association for Computational Linguistics},
	volume   = {4},
	pages    = {477--490},
	year     = {2016},
	issn     = {2307-387X},
	url      = {https://transacl.org/ojs/index.php/tacl/article/view/865},
	abstract = {Efficient methods for storing and querying are critical for scaling high-order m-gram language models to large corpora. We propose a language model based on compressed suffix trees, a representation that is highly compact and can be easily held in memory, while supporting queries needed in computing language model probabilities on-the-fly. We present several optimizations which improve query runtimes up to 2500x, despite only incurring a modest increase in construction time and memory usage. For large corpora and high Markov orders, our method is highly competitive with the state-of-the-art KenLM package. It imposes much lower memory requirements, often by orders of magnitude, and has runtimes that are either similar (for training) or comparable (for querying).},
}
