@inproceedings{kreek-apostolova-2018-training,
  title     = {Training and Prediction Data Discrepancies: Challenges of Text Classification with Noisy, Historical Data},
  author    = {Kreek, R. Andrew and
               Apostolova, Emilia},
  editor    = {Xu, Wei and
               Ritter, Alan and
               Baldwin, Tim and
               Rahimi, Afshin},
  booktitle = {Proceedings of the 2018 {EMNLP} Workshop W-{NUT}: The 4th Workshop on Noisy User-generated Text},
  month     = nov,
  year      = {2018},
  address   = {Brussels, Belgium},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/W18-6114},
  doi       = {10.18653/v1/W18-6114},
  pages     = {104--109},
  abstract  = {Industry datasets used for text classification are rarely created for that purpose. In most cases, the data and target predictions are a by-product of accumulated historical data, typically fraught with noise, present in both the text-based document, as well as in the targeted labels. In this work, we address the question of how well performance metrics computed on noisy, historical data reflect the performance on the intended future machine learning model input. The results demonstrate the utility of dirty training datasets used to build prediction models for cleaner (and different) prediction inputs.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="kreek-apostolova-2018-training">
<titleInfo>
<title>Training and Prediction Data Discrepancies: Challenges of Text Classification with Noisy, Historical Data</title>
</titleInfo>
<name type="personal">
<namePart type="given">R.</namePart>
<namePart type="given">Andrew</namePart>
<namePart type="family">Kreek</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Emilia</namePart>
<namePart type="family">Apostolova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2018-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2018 EMNLP Workshop W-NUT: The 4th Workshop on Noisy User-generated Text</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wei</namePart>
<namePart type="family">Xu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alan</namePart>
<namePart type="family">Ritter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tim</namePart>
<namePart type="family">Baldwin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Afshin</namePart>
<namePart type="family">Rahimi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Brussels, Belgium</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Industry datasets used for text classification are rarely created for that purpose. In most cases, the data and target predictions are a by-product of accumulated historical data, typically fraught with noise, present in both the text-based document, as well as in the targeted labels. In this work, we address the question of how well performance metrics computed on noisy, historical data reflect the performance on the intended future machine learning model input. The results demonstrate the utility of dirty training datasets used to build prediction models for cleaner (and different) prediction inputs.</abstract>
<identifier type="citekey">kreek-apostolova-2018-training</identifier>
<identifier type="doi">10.18653/v1/W18-6114</identifier>
<location>
<url>https://aclanthology.org/W18-6114</url>
</location>
<part>
<date>2018-11</date>
<extent unit="page">
<start>104</start>
<end>109</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Training and Prediction Data Discrepancies: Challenges of Text Classification with Noisy, Historical Data
%A Kreek, R. Andrew
%A Apostolova, Emilia
%Y Xu, Wei
%Y Ritter, Alan
%Y Baldwin, Tim
%Y Rahimi, Afshin
%S Proceedings of the 2018 EMNLP Workshop W-NUT: The 4th Workshop on Noisy User-generated Text
%D 2018
%8 November
%I Association for Computational Linguistics
%C Brussels, Belgium
%F kreek-apostolova-2018-training
%X Industry datasets used for text classification are rarely created for that purpose. In most cases, the data and target predictions are a by-product of accumulated historical data, typically fraught with noise, present in both the text-based document, as well as in the targeted labels. In this work, we address the question of how well performance metrics computed on noisy, historical data reflect the performance on the intended future machine learning model input. The results demonstrate the utility of dirty training datasets used to build prediction models for cleaner (and different) prediction inputs.
%R 10.18653/v1/W18-6114
%U https://aclanthology.org/W18-6114
%U https://doi.org/10.18653/v1/W18-6114
%P 104-109
Markdown (Informal)
[Training and Prediction Data Discrepancies: Challenges of Text Classification with Noisy, Historical Data](https://aclanthology.org/W18-6114) (Kreek & Apostolova, WNUT 2018)
ACL