@inproceedings{brahma-2019-improved,
title = "Improved Language Modeling by Decoding the Past",
author = "Brahma, Siddhartha",
editor = "Korhonen, Anna and
Traum, David and
M{\`a}rquez, Llu{\'\i}s",
booktitle = "Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics",
month = jul,
year = "2019",
address = "Florence, Italy",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/P19-1142",
doi = "10.18653/v1/P19-1142",
pages = "1468--1476",
abstract = "Highly regularized LSTMs achieve impressive results on several benchmark datasets in language modeling. We propose a new regularization method based on decoding the last token in the context using the predicted distribution of the next token. This biases the model towards retaining more contextual information, in turn improving its ability to predict the next token. With negligible overhead in the number of parameters and training time, our Past Decode Regularization (PDR) method improves perplexity on the Penn Treebank dataset by up to 1.8 points and by up to 2.3 points on the WikiText-2 dataset, over strong regularized baselines using a single softmax. With a mixture-of-softmax model, we show gains of up to 1.0 perplexity points on these datasets. In addition, our method achieves 1.169 bits-per-character on the Penn Treebank Character dataset for character level language modeling.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="brahma-2019-improved">
  <titleInfo>
    <title>Improved Language Modeling by Decoding the Past</title>
  </titleInfo>
  <name type="personal">
    <namePart type="given">Siddhartha</namePart>
    <namePart type="family">Brahma</namePart>
    <role>
      <roleTerm authority="marcrelator" type="text">author</roleTerm>
    </role>
  </name>
  <originInfo>
    <dateIssued>2019-07</dateIssued>
  </originInfo>
  <typeOfResource>text</typeOfResource>
  <relatedItem type="host">
    <titleInfo>
      <title>Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Anna</namePart>
      <namePart type="family">Korhonen</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">editor</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">David</namePart>
      <namePart type="family">Traum</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">editor</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Lluís</namePart>
      <namePart type="family">Màrquez</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">editor</roleTerm>
      </role>
    </name>
    <originInfo>
      <publisher>Association for Computational Linguistics</publisher>
      <place>
        <placeTerm type="text">Florence, Italy</placeTerm>
      </place>
    </originInfo>
    <genre authority="marcgt">conference publication</genre>
  </relatedItem>
  <abstract>Highly regularized LSTMs achieve impressive results on several benchmark datasets in language modeling. We propose a new regularization method based on decoding the last token in the context using the predicted distribution of the next token. This biases the model towards retaining more contextual information, in turn improving its ability to predict the next token. With negligible overhead in the number of parameters and training time, our Past Decode Regularization (PDR) method improves perplexity on the Penn Treebank dataset by up to 1.8 points and by up to 2.3 points on the WikiText-2 dataset, over strong regularized baselines using a single softmax. With a mixture-of-softmax model, we show gains of up to 1.0 perplexity points on these datasets. In addition, our method achieves 1.169 bits-per-character on the Penn Treebank Character dataset for character level language modeling.</abstract>
  <identifier type="citekey">brahma-2019-improved</identifier>
  <identifier type="doi">10.18653/v1/P19-1142</identifier>
  <location>
    <url>https://aclanthology.org/P19-1142</url>
  </location>
  <part>
    <date>2019-07</date>
    <extent unit="page">
      <start>1468</start>
      <end>1476</end>
    </extent>
  </part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Improved Language Modeling by Decoding the Past
%A Brahma, Siddhartha
%Y Korhonen, Anna
%Y Traum, David
%Y Màrquez, Lluís
%S Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics
%D 2019
%8 July
%I Association for Computational Linguistics
%C Florence, Italy
%F brahma-2019-improved
%X Highly regularized LSTMs achieve impressive results on several benchmark datasets in language modeling. We propose a new regularization method based on decoding the last token in the context using the predicted distribution of the next token. This biases the model towards retaining more contextual information, in turn improving its ability to predict the next token. With negligible overhead in the number of parameters and training time, our Past Decode Regularization (PDR) method improves perplexity on the Penn Treebank dataset by up to 1.8 points and by up to 2.3 points on the WikiText-2 dataset, over strong regularized baselines using a single softmax. With a mixture-of-softmax model, we show gains of up to 1.0 perplexity points on these datasets. In addition, our method achieves 1.169 bits-per-character on the Penn Treebank Character dataset for character level language modeling.
%R 10.18653/v1/P19-1142
%U https://aclanthology.org/P19-1142
%U https://doi.org/10.18653/v1/P19-1142
%P 1468-1476
Markdown (Informal)
[Improved Language Modeling by Decoding the Past](https://aclanthology.org/P19-1142) (Brahma, ACL 2019)
ACL
Siddhartha Brahma. 2019. Improved Language Modeling by Decoding the Past. In Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics, pages 1468–1476, Florence, Italy. Association for Computational Linguistics.
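
The abstract describes Past Decode Regularization (PDR) only at a high level: re-decode the last token of the context from the distribution the model predicts for the next token, and add that decoding loss to the usual language modeling loss. Below is a minimal PyTorch sketch of that idea, assuming tied input/output embeddings; the names `pdr_loss` and `lambda_pdr` are illustrative, and the exact formulation (projections, biases, regularization weight) should be taken from the paper, not from this sketch.

```python
# Minimal sketch of the Past Decode Regularization (PDR) idea, assuming tied
# input/output embeddings. Names (pdr_loss, lambda_pdr) are illustrative, not
# taken from the paper.
import torch
import torch.nn.functional as F


def pdr_loss(next_token_logits, last_context_tokens, embedding_weight):
    """Cross-entropy of re-decoding the last context token x_t from the
    distribution predicted for the next token x_{t+1}.

    next_token_logits:   (batch, vocab) logits produced at step t
    last_context_tokens: (batch,) index of x_t, the last token of the context
    embedding_weight:    (vocab, dim) tied embedding matrix
    """
    p_next = F.softmax(next_token_logits, dim=-1)      # predicted next-token distribution
    context_vec = p_next @ embedding_weight            # expected embedding under that distribution
    past_logits = context_vec @ embedding_weight.t()   # decode back into the vocabulary
    return F.cross_entropy(past_logits, last_context_tokens)


# Training objective as described in the abstract: the usual LM loss plus a
# weighted past-decoding term, e.g.
#   loss = lm_loss + lambda_pdr * pdr_loss(logits, x_t, model.embedding.weight)
```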