@article{shareghi-etal-2016-fast,
title = "Fast, Small and Exact: Infinite-order Language Modelling with Compressed Suffix Trees",
author = "Shareghi, Ehsan and
Petri, Matthias and
Haffari, Gholamreza and
Cohn, Trevor",
editor = "Lee, Lillian and
Johnson, Mark and
Toutanova, Kristina",
journal = "Transactions of the Association for Computational Linguistics",
volume = "4",
year = "2016",
address = "Cambridge, MA",
publisher = "MIT Press",
url = "https://aclanthology.org/Q16-1034/",
doi = "10.1162/tacl_a_00112",
pages = "477--490",
abstract = "Efficient methods for storing and querying are critical for scaling high-order m-gram language models to large corpora. We propose a language model based on compressed suffix trees, a representation that is highly compact and can be easily held in memory, while supporting queries needed in computing language model probabilities on-the-fly. We present several optimisations which improve query runtimes up to 2500{\texttimes}, despite only incurring a modest increase in construction time and memory usage. For large corpora and high Markov orders, our method is highly competitive with the state-of-the-art KenLM package. It imposes much lower memory requirements, often by orders of magnitude, and has runtimes that are either similar (for training) or comparable (for querying)."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="shareghi-etal-2016-fast">
<titleInfo>
<title>Fast, Small and Exact: Infinite-order Language Modelling with Compressed Suffix Trees</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ehsan</namePart>
<namePart type="family">Shareghi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Matthias</namePart>
<namePart type="family">Petri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gholamreza</namePart>
<namePart type="family">Haffari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Trevor</namePart>
<namePart type="family">Cohn</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2016</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<genre authority="bibutilsgt">journal article</genre>
<relatedItem type="host">
<titleInfo>
<title>Transactions of the Association for Computational Linguistics</title>
</titleInfo>
<originInfo>
<issuance>continuing</issuance>
<publisher>MIT Press</publisher>
<place>
<placeTerm type="text">Cambridge, MA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">periodical</genre>
<genre authority="bibutilsgt">academic journal</genre>
</relatedItem>
<abstract>Efficient methods for storing and querying are critical for scaling high-order m-gram language models to large corpora. We propose a language model based on compressed suffix trees, a representation that is highly compact and can be easily held in memory, while supporting queries needed in computing language model probabilities on-the-fly. We present several optimisations which improve query runtimes up to 2500×, despite only incurring a modest increase in construction time and memory usage. For large corpora and high Markov orders, our method is highly competitive with the state-of-the-art KenLM package. It imposes much lower memory requirements, often by orders of magnitude, and has runtimes that are either similar (for training) or comparable (for querying).</abstract>
<identifier type="citekey">shareghi-etal-2016-fast</identifier>
<identifier type="doi">10.1162/tacl_a_00112</identifier>
<location>
<url>https://aclanthology.org/Q16-1034/</url>
</location>
<part>
<date>2016</date>
<detail type="volume"><number>4</number></detail>
<extent unit="page">
<start>477</start>
<end>490</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Journal Article
%T Fast, Small and Exact: Infinite-order Language Modelling with Compressed Suffix Trees
%A Shareghi, Ehsan
%A Petri, Matthias
%A Haffari, Gholamreza
%A Cohn, Trevor
%J Transactions of the Association for Computational Linguistics
%D 2016
%V 4
%I MIT Press
%C Cambridge, MA
%F shareghi-etal-2016-fast
%X Efficient methods for storing and querying are critical for scaling high-order m-gram language models to large corpora. We propose a language model based on compressed suffix trees, a representation that is highly compact and can be easily held in memory, while supporting queries needed in computing language model probabilities on-the-fly. We present several optimisations which improve query runtimes up to 2500×, despite only incurring a modest increase in construction time and memory usage. For large corpora and high Markov orders, our method is highly competitive with the state-of-the-art KenLM package. It imposes much lower memory requirements, often by orders of magnitude, and has runtimes that are either similar (for training) or comparable (for querying).
%R 10.1162/tacl_a_00112
%U https://aclanthology.org/Q16-1034/
%U https://doi.org/10.1162/tacl_a_00112
%P 477-490
Markdown (Informal)
[Fast, Small and Exact: Infinite-order Language Modelling with Compressed Suffix Trees](https://aclanthology.org/Q16-1034/) (Shareghi et al., TACL 2016)
ACL