@inproceedings{roberts-2016-assessing,
title = "Assessing the Corpus Size vs. Similarity Trade-off for Word Embeddings in Clinical {NLP}",
author = "Roberts, Kirk",
editor = "Rumshisky, Anna and
Roberts, Kirk and
Bethard, Steven and
Naumann, Tristan",
booktitle = "Proceedings of the Clinical Natural Language Processing Workshop ({C}linical{NLP})",
month = dec,
year = "2016",
address = "Osaka, Japan",
publisher = "The COLING 2016 Organizing Committee",
url = "https://aclanthology.org/W16-4208/",
pages = "54--63",
abstract = "The proliferation of deep learning methods in natural language processing (NLP) and the large amounts of data they often require stands in stark contrast to the relatively data-poor clinical NLP domain. In particular, large text corpora are necessary to build high-quality word embeddings, yet often large corpora that are suitably representative of the target clinical data are unavailable. This forces a choice between building embeddings from small clinical corpora and less representative, larger corpora. This paper explores this trade-off, as well as intermediate compromise solutions. Two standard clinical NLP tasks (the i2b2 2010 concept and assertion tasks) are evaluated with commonly used deep learning models (recurrent neural networks and convolutional neural networks) using a set of six corpora ranging from the target i2b2 data to large open-domain datasets. While combinations of corpora are generally found to work best, the single-best corpus is generally task-dependent."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="roberts-2016-assessing">
<titleInfo>
<title>Assessing the Corpus Size vs. Similarity Trade-off for Word Embeddings in Clinical NLP</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kirk</namePart>
<namePart type="family">Roberts</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2016-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Clinical Natural Language Processing Workshop (ClinicalNLP)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Anna</namePart>
<namePart type="family">Rumshisky</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kirk</namePart>
<namePart type="family">Roberts</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Steven</namePart>
<namePart type="family">Bethard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tristan</namePart>
<namePart type="family">Naumann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>The COLING 2016 Organizing Committee</publisher>
<place>
<placeTerm type="text">Osaka, Japan</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>The proliferation of deep learning methods in natural language processing (NLP) and the large amounts of data they often require stands in stark contrast to the relatively data-poor clinical NLP domain. In particular, large text corpora are necessary to build high-quality word embeddings, yet often large corpora that are suitably representative of the target clinical data are unavailable. This forces a choice between building embeddings from small clinical corpora and less representative, larger corpora. This paper explores this trade-off, as well as intermediate compromise solutions. Two standard clinical NLP tasks (the i2b2 2010 concept and assertion tasks) are evaluated with commonly used deep learning models (recurrent neural networks and convolutional neural networks) using a set of six corpora ranging from the target i2b2 data to large open-domain datasets. While combinations of corpora are generally found to work best, the single-best corpus is generally task-dependent.</abstract>
<identifier type="citekey">roberts-2016-assessing</identifier>
<location>
<url>https://aclanthology.org/W16-4208/</url>
</location>
<part>
<date>2016-12</date>
<extent unit="page">
<start>54</start>
<end>63</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Assessing the Corpus Size vs. Similarity Trade-off for Word Embeddings in Clinical NLP
%A Roberts, Kirk
%Y Rumshisky, Anna
%Y Roberts, Kirk
%Y Bethard, Steven
%Y Naumann, Tristan
%S Proceedings of the Clinical Natural Language Processing Workshop (ClinicalNLP)
%D 2016
%8 December
%I The COLING 2016 Organizing Committee
%C Osaka, Japan
%F roberts-2016-assessing
%X The proliferation of deep learning methods in natural language processing (NLP) and the large amounts of data they often require stands in stark contrast to the relatively data-poor clinical NLP domain. In particular, large text corpora are necessary to build high-quality word embeddings, yet often large corpora that are suitably representative of the target clinical data are unavailable. This forces a choice between building embeddings from small clinical corpora and less representative, larger corpora. This paper explores this trade-off, as well as intermediate compromise solutions. Two standard clinical NLP tasks (the i2b2 2010 concept and assertion tasks) are evaluated with commonly used deep learning models (recurrent neural networks and convolutional neural networks) using a set of six corpora ranging from the target i2b2 data to large open-domain datasets. While combinations of corpora are generally found to work best, the single-best corpus is generally task-dependent.
%U https://aclanthology.org/W16-4208/
%P 54-63
Markdown (Informal)
[Assessing the Corpus Size vs. Similarity Trade-off for Word Embeddings in Clinical NLP](https://aclanthology.org/W16-4208/) (Roberts, ClinicalNLP 2016)
ACL