@inproceedings{lagus-etal-2019-low,
title = "Low-Rank Approximations of Second-Order Document Representations",
author = "Lagus, Jarkko and
Sinkkonen, Janne and
Klami, Arto",
editor = "Bansal, Mohit and
Villavicencio, Aline",
booktitle = "Proceedings of the 23rd Conference on Computational Natural Language Learning (CoNLL)",
month = nov,
year = "2019",
address = "Hong Kong, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/K19-1059/",
doi = "10.18653/v1/K19-1059",
pages = "634--644",
abstract = "Document embeddings, created with methods ranging from simple heuristics to statistical and deep models, are widely applicable. Bag-of-vectors models for documents include the mean and quadratic approaches (Torki, 2018). We present evidence that quadratic statistics alone, without the mean information, can offer superior accuracy, fast document comparison, and compact document representations. In matching news articles to their comment threads, low-rank representations of only 3-4 times the size of the mean vector give most accurate matching, and in standard sentence comparison tasks, results are state of the art despite faster computation. Similarity measures are discussed, and the Frobenius product implicit in the proposed method is contrasted to Wasserstein or Bures metric from the transportation theory. We also shortly demonstrate matching of unordered word lists to documents, to measure topicality or sentiment of documents."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="lagus-etal-2019-low">
<titleInfo>
<title>Low-Rank Approximations of Second-Order Document Representations</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jarkko</namePart>
<namePart type="family">Lagus</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Janne</namePart>
<namePart type="family">Sinkkonen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Arto</namePart>
<namePart type="family">Klami</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2019-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 23rd Conference on Computational Natural Language Learning (CoNLL)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Mohit</namePart>
<namePart type="family">Bansal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aline</namePart>
<namePart type="family">Villavicencio</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Hong Kong, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Document embeddings, created with methods ranging from simple heuristics to statistical and deep models, are widely applicable. Bag-of-vectors models for documents include the mean and quadratic approaches (Torki, 2018). We present evidence that quadratic statistics alone, without the mean information, can offer superior accuracy, fast document comparison, and compact document representations. In matching news articles to their comment threads, low-rank representations of only 3-4 times the size of the mean vector give most accurate matching, and in standard sentence comparison tasks, results are state of the art despite faster computation. Similarity measures are discussed, and the Frobenius product implicit in the proposed method is contrasted to Wasserstein or Bures metric from the transportation theory. We also shortly demonstrate matching of unordered word lists to documents, to measure topicality or sentiment of documents.</abstract>
<identifier type="citekey">lagus-etal-2019-low</identifier>
<identifier type="doi">10.18653/v1/K19-1059</identifier>
<location>
<url>https://aclanthology.org/K19-1059/</url>
</location>
<part>
<date>2019-11</date>
<extent unit="page">
<start>634</start>
<end>644</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Low-Rank Approximations of Second-Order Document Representations
%A Lagus, Jarkko
%A Sinkkonen, Janne
%A Klami, Arto
%Y Bansal, Mohit
%Y Villavicencio, Aline
%S Proceedings of the 23rd Conference on Computational Natural Language Learning (CoNLL)
%D 2019
%8 November
%I Association for Computational Linguistics
%C Hong Kong, China
%F lagus-etal-2019-low
%X Document embeddings, created with methods ranging from simple heuristics to statistical and deep models, are widely applicable. Bag-of-vectors models for documents include the mean and quadratic approaches (Torki, 2018). We present evidence that quadratic statistics alone, without the mean information, can offer superior accuracy, fast document comparison, and compact document representations. In matching news articles to their comment threads, low-rank representations of only 3-4 times the size of the mean vector give most accurate matching, and in standard sentence comparison tasks, results are state of the art despite faster computation. Similarity measures are discussed, and the Frobenius product implicit in the proposed method is contrasted to Wasserstein or Bures metric from the transportation theory. We also shortly demonstrate matching of unordered word lists to documents, to measure topicality or sentiment of documents.
%R 10.18653/v1/K19-1059
%U https://aclanthology.org/K19-1059/
%U https://doi.org/10.18653/v1/K19-1059
%P 634-644
Markdown (Informal)
[Low-Rank Approximations of Second-Order Document Representations](https://aclanthology.org/K19-1059/) (Lagus et al., CoNLL 2019)
ACL