@inproceedings{fernandez-downey-2018-sampling,
title = "Sampling Informative Training Data for {RNN} Language Models",
author = "Fernandez, Jared and
Downey, Doug",
editor = "Shwartz, Vered and
Tabassum, Jeniya and
Voigt, Rob and
Che, Wanxiang and
de Marneffe, Marie-Catherine and
Nissim, Malvina",
booktitle = "Proceedings of {ACL} 2018, Student Research Workshop",
month = jul,
year = "2018",
address = "Melbourne, Australia",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/P18-3002",
doi = "10.18653/v1/P18-3002",
pages = "9--13",
    abstract = "We propose an unsupervised importance sampling approach to selecting training data for recurrent neural network (RNN) language models. To increase the information content of the training set, our approach preferentially samples high-perplexity sentences, as determined by an easily queryable n-gram language model. We experimentally evaluate the heldout perplexity of models trained with our various importance sampling distributions. We show that language models trained on data sampled using our proposed approach outperform models trained over randomly sampled subsets of both the Billion Word (Chelba et al., 2014) and Wikitext-103 (Merity et al., 2016) benchmark corpora.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="fernandez-downey-2018-sampling">
<titleInfo>
<title>Sampling Informative Training Data for RNN Language Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jared</namePart>
<namePart type="family">Fernandez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Doug</namePart>
<namePart type="family">Downey</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2018-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of ACL 2018, Student Research Workshop</title>
</titleInfo>
<name type="personal">
<namePart type="given">Vered</namePart>
<namePart type="family">Shwartz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jeniya</namePart>
<namePart type="family">Tabassum</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rob</namePart>
<namePart type="family">Voigt</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wanxiang</namePart>
<namePart type="family">Che</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marie-Catherine</namePart>
<namePart type="family">de Marneffe</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Malvina</namePart>
<namePart type="family">Nissim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Melbourne, Australia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We propose an unsupervised importance sampling approach to selecting training data for recurrent neural network (RNN) language models. To increase the information content of the training set, our approach preferentially samples high-perplexity sentences, as determined by an easily queryable n-gram language model. We experimentally evaluate the heldout perplexity of models trained with our various importance sampling distributions. We show that language models trained on data sampled using our proposed approach outperform models trained over randomly sampled subsets of both the Billion Word (Chelba et al., 2014) and Wikitext-103 (Merity et al., 2016) benchmark corpora.</abstract>
<identifier type="citekey">fernandez-downey-2018-sampling</identifier>
<identifier type="doi">10.18653/v1/P18-3002</identifier>
<location>
<url>https://aclanthology.org/P18-3002</url>
</location>
<part>
<date>2018-07</date>
<extent unit="page">
<start>9</start>
<end>13</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Sampling Informative Training Data for RNN Language Models
%A Fernandez, Jared
%A Downey, Doug
%Y Shwartz, Vered
%Y Tabassum, Jeniya
%Y Voigt, Rob
%Y Che, Wanxiang
%Y de Marneffe, Marie-Catherine
%Y Nissim, Malvina
%S Proceedings of ACL 2018, Student Research Workshop
%D 2018
%8 July
%I Association for Computational Linguistics
%C Melbourne, Australia
%F fernandez-downey-2018-sampling
%X We propose an unsupervised importance sampling approach to selecting training data for recurrent neural network (RNN) language models. To increase the information content of the training set, our approach preferentially samples high-perplexity sentences, as determined by an easily queryable n-gram language model. We experimentally evaluate the heldout perplexity of models trained with our various importance sampling distributions. We show that language models trained on data sampled using our proposed approach outperform models trained over randomly sampled subsets of both the Billion Word (Chelba et al., 2014) and Wikitext-103 (Merity et al., 2016) benchmark corpora.
%R 10.18653/v1/P18-3002
%U https://aclanthology.org/P18-3002
%U https://doi.org/10.18653/v1/P18-3002
%P 9-13
Markdown (Informal)
[Sampling Informative Training Data for RNN Language Models](https://aclanthology.org/P18-3002) (Fernandez & Downey, ACL 2018)
ACL
Jared Fernandez and Doug Downey. 2018. Sampling Informative Training Data for RNN Language Models. In Proceedings of ACL 2018, Student Research Workshop, pages 9–13, Melbourne, Australia. Association for Computational Linguistics.
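
For readers skimming the abstract above, the following is a minimal, self-contained sketch of the core idea: score candidate training sentences with a cheap language model and sample them with weights that favor high perplexity. It substitutes an add-one-smoothed unigram model for the "easily queryable n-gram language model" the abstract mentions, and uses perplexity-proportional weights as one illustrative importance distribution; the function names and toy data are hypothetical and are not taken from the paper's implementation.

```python
import math
import random
from collections import Counter


def train_unigram_lm(sentences):
    """Count word frequencies over a reference corpus (a stand-in for the
    cheap, queryable n-gram language model described in the abstract)."""
    counts = Counter(tok for s in sentences for tok in s.split())
    total = sum(counts.values())
    vocab = len(counts) + 1  # +1 reserves mass for unseen words
    return counts, total, vocab


def perplexity(sentence, counts, total, vocab):
    """Per-sentence perplexity under the add-one-smoothed unigram model."""
    tokens = sentence.split()
    if not tokens:
        return 1.0
    log_prob = 0.0
    for tok in tokens:
        p = (counts.get(tok, 0) + 1) / (total + vocab)
        log_prob += math.log(p)
    return math.exp(-log_prob / len(tokens))


def sample_informative(pool, counts, total, vocab, k, seed=0):
    """Importance-sample k training sentences, weighting each sentence in
    proportion to its perplexity so that surprising (high-perplexity)
    sentences are preferred. Proportional-to-perplexity weighting is one
    illustrative choice, not necessarily the paper's exact distribution."""
    weights = [perplexity(s, counts, total, vocab) for s in pool]
    rng = random.Random(seed)
    return rng.choices(pool, weights=weights, k=k)


if __name__ == "__main__":
    reference = ["the cat sat on the mat", "the dog sat on the rug"]
    pool = [
        "the cat sat on the mat",            # low perplexity: already well covered
        "a quantum solver beats heuristics",  # high perplexity: mostly unseen words
    ]
    counts, total, vocab = train_unigram_lm(reference)
    subset = sample_informative(pool, counts, total, vocab, k=5)
    print(subset)  # the unseen sentence should dominate the sample
```

In this toy run the high-perplexity sentence is drawn far more often than the familiar one, which is the intended effect: the sampled subset carries more information per sentence than a uniformly random subset of the same size.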