@inproceedings{li-mak-2017-derivation,
title = "Derivation of Document Vectors from Adaptation of {LSTM} Language Model",
author = "Li, Wei and
Mak, Brian",
editor = "Lapata, Mirella and
Blunsom, Phil and
Koller, Alexander",
booktitle = "Proceedings of the 15th Conference of the {E}uropean Chapter of the Association for Computational Linguistics: Volume 2, Short Papers",
month = apr,
year = "2017",
address = "Valencia, Spain",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/E17-2073",
pages = "456--461",
abstract = "In many natural language processing (NLP) tasks, a document is commonly modeled as a bag of words using the term frequency-inverse document frequency (TF-IDF) vector. One major shortcoming of the frequency-based TF-IDF feature vector is that it ignores word orders that carry syntactic and semantic relationships among the words in a document. This paper proposes a novel distributed vector representation of a document, which will be labeled as DV-LSTM, and is derived from the result of adapting a long short-term memory recurrent neural network language model by the document. DV-LSTM is expected to capture some high-level sequential information in the document, which other current document representations fail to do. It was evaluated in document genre classification in the Brown Corpus and the BNC Baby Corpus. The results show that DV-LSTM significantly outperforms TF-IDF vector and paragraph vector (PV-DM) in most cases, and their combinations may further improve the classification performance.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="li-mak-2017-derivation">
<titleInfo>
<title>Derivation of Document Vectors from Adaptation of LSTM Language Model</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wei</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Brian</namePart>
<namePart type="family">Mak</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2017-04</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 15th Conference of the European Chapter of the Association for Computational Linguistics: Volume 2, Short Papers</title>
</titleInfo>
<name type="personal">
<namePart type="given">Mirella</namePart>
<namePart type="family">Lapata</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Phil</namePart>
<namePart type="family">Blunsom</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alexander</namePart>
<namePart type="family">Koller</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Valencia, Spain</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>In many natural language processing (NLP) tasks, a document is commonly modeled as a bag of words using the term frequency-inverse document frequency (TF-IDF) vector. One major shortcoming of the frequency-based TF-IDF feature vector is that it ignores word orders that carry syntactic and semantic relationships among the words in a document. This paper proposes a novel distributed vector representation of a document, which will be labeled as DV-LSTM, and is derived from the result of adapting a long short-term memory recurrent neural network language model by the document. DV-LSTM is expected to capture some high-level sequential information in the document, which other current document representations fail to do. It was evaluated in document genre classification in the Brown Corpus and the BNC Baby Corpus. The results show that DV-LSTM significantly outperforms TF-IDF vector and paragraph vector (PV-DM) in most cases, and their combinations may further improve the classification performance.</abstract>
<identifier type="citekey">li-mak-2017-derivation</identifier>
<location>
<url>https://aclanthology.org/E17-2073</url>
</location>
<part>
<date>2017-04</date>
<extent unit="page">
<start>456</start>
<end>461</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Derivation of Document Vectors from Adaptation of LSTM Language Model
%A Li, Wei
%A Mak, Brian
%Y Lapata, Mirella
%Y Blunsom, Phil
%Y Koller, Alexander
%S Proceedings of the 15th Conference of the European Chapter of the Association for Computational Linguistics: Volume 2, Short Papers
%D 2017
%8 April
%I Association for Computational Linguistics
%C Valencia, Spain
%F li-mak-2017-derivation
%X In many natural language processing (NLP) tasks, a document is commonly modeled as a bag of words using the term frequency-inverse document frequency (TF-IDF) vector. One major shortcoming of the frequency-based TF-IDF feature vector is that it ignores word orders that carry syntactic and semantic relationships among the words in a document. This paper proposes a novel distributed vector representation of a document, which will be labeled as DV-LSTM, and is derived from the result of adapting a long short-term memory recurrent neural network language model by the document. DV-LSTM is expected to capture some high-level sequential information in the document, which other current document representations fail to do. It was evaluated in document genre classification in the Brown Corpus and the BNC Baby Corpus. The results show that DV-LSTM significantly outperforms TF-IDF vector and paragraph vector (PV-DM) in most cases, and their combinations may further improve the classification performance.
%U https://aclanthology.org/E17-2073
%P 456-461
Markdown (Informal)
[Derivation of Document Vectors from Adaptation of LSTM Language Model](https://aclanthology.org/E17-2073) (Li & Mak, EACL 2017)
ACL