@inproceedings{cognetta-etal-2019-compression,
title = "On the Compression of Lexicon Transducers",
author = "Cognetta, Marco and
Allauzen, Cyril and
Riley, Michael",
editor = "Vogler, Heiko and
Maletti, Andreas",
booktitle = "Proceedings of the 14th International Conference on Finite-State Methods and Natural Language Processing",
month = sep,
year = "2019",
address = "Dresden, Germany",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/W19-3105",
doi = "10.18653/v1/W19-3105",
pages = "18--26",
abstract = "In finite-state language processing pipelines, a lexicon is often a key component. It needs to be comprehensive to ensure accuracy, reducing out-of-vocabulary misses. However, in memory-constrained environments (e.g., mobile phones), the size of the component automata must be kept small. Indeed, a delicate balance between comprehensiveness, speed, and memory must be struck to conform to device requirements while providing a good user experience.In this paper, we describe a compression scheme for lexicons when represented as finite-state transducers. We efficiently encode the graph of the transducer while storing transition labels separately. The graph encoding scheme is based on the LOUDS (Level Order Unary Degree Sequence) tree representation, which has constant time tree traversal for queries while being information-theoretically optimal in space. We find that our encoding is near the theoretical lower bound for such graphs and substantially outperforms more traditional representations in space while remaining competitive in latency benchmarks.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="cognetta-etal-2019-compression">
<titleInfo>
<title>On the Compression of Lexicon Transducers</title>
</titleInfo>
<name type="personal">
<namePart type="given">Marco</namePart>
<namePart type="family">Cognetta</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Cyril</namePart>
<namePart type="family">Allauzen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Michael</namePart>
<namePart type="family">Riley</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2019-09</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 14th International Conference on Finite-State Methods and Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Heiko</namePart>
<namePart type="family">Vogler</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Andreas</namePart>
<namePart type="family">Maletti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Dresden, Germany</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>In finite-state language processing pipelines, a lexicon is often a key component. It needs to be comprehensive to ensure accuracy, reducing out-of-vocabulary misses. However, in memory-constrained environments (e.g., mobile phones), the size of the component automata must be kept small. Indeed, a delicate balance between comprehensiveness, speed, and memory must be struck to conform to device requirements while providing a good user experience.In this paper, we describe a compression scheme for lexicons when represented as finite-state transducers. We efficiently encode the graph of the transducer while storing transition labels separately. The graph encoding scheme is based on the LOUDS (Level Order Unary Degree Sequence) tree representation, which has constant time tree traversal for queries while being information-theoretically optimal in space. We find that our encoding is near the theoretical lower bound for such graphs and substantially outperforms more traditional representations in space while remaining competitive in latency benchmarks.</abstract>
<identifier type="citekey">cognetta-etal-2019-compression</identifier>
<identifier type="doi">10.18653/v1/W19-3105</identifier>
<location>
<url>https://aclanthology.org/W19-3105</url>
</location>
<part>
<date>2019-09</date>
<extent unit="page">
<start>18</start>
<end>26</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T On the Compression of Lexicon Transducers
%A Cognetta, Marco
%A Allauzen, Cyril
%A Riley, Michael
%Y Vogler, Heiko
%Y Maletti, Andreas
%S Proceedings of the 14th International Conference on Finite-State Methods and Natural Language Processing
%D 2019
%8 September
%I Association for Computational Linguistics
%C Dresden, Germany
%F cognetta-etal-2019-compression
%X In finite-state language processing pipelines, a lexicon is often a key component. It needs to be comprehensive to ensure accuracy, reducing out-of-vocabulary misses. However, in memory-constrained environments (e.g., mobile phones), the size of the component automata must be kept small. Indeed, a delicate balance between comprehensiveness, speed, and memory must be struck to conform to device requirements while providing a good user experience.In this paper, we describe a compression scheme for lexicons when represented as finite-state transducers. We efficiently encode the graph of the transducer while storing transition labels separately. The graph encoding scheme is based on the LOUDS (Level Order Unary Degree Sequence) tree representation, which has constant time tree traversal for queries while being information-theoretically optimal in space. We find that our encoding is near the theoretical lower bound for such graphs and substantially outperforms more traditional representations in space while remaining competitive in latency benchmarks.
%R 10.18653/v1/W19-3105
%U https://aclanthology.org/W19-3105
%U https://doi.org/10.18653/v1/W19-3105
%P 18-26
Markdown (Informal)
[On the Compression of Lexicon Transducers](https://aclanthology.org/W19-3105) (Cognetta et al., FSMNLP 2019)
ACL
- Marco Cognetta, Cyril Allauzen, and Michael Riley. 2019. On the Compression of Lexicon Transducers. In Proceedings of the 14th International Conference on Finite-State Methods and Natural Language Processing, pages 18–26, Dresden, Germany. Association for Computational Linguistics.