BibTeX
@inproceedings{quteineh-etal-2020-textual,
    title = "Textual Data Augmentation for Efficient Active Learning on Tiny Datasets",
    author = "Quteineh, Husam and
      Samothrakis, Spyridon and
      Sutcliffe, Richard",
    editor = "Webber, Bonnie and
      Cohn, Trevor and
      He, Yulan and
      Liu, Yang",
    booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)",
    month = nov,
    year = "2020",
    address = "Online",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2020.emnlp-main.600",
    doi = "10.18653/v1/2020.emnlp-main.600",
    pages = "7400--7410",
    abstract = "In this paper we propose a novel data augmentation approach where guided outputs of a language generation model, e.g. GPT-2, when labeled, can improve the performance of text classifiers through an active learning process. We transform the data generation task into an optimization problem which maximizes the usefulness of the generated output, using Monte Carlo Tree Search (MCTS) as the optimization strategy and incorporating entropy as one of the optimization criteria. We test our approach against a Non-Guided Data Generation (NGDG) process that does not optimize for a reward function. Starting with a small set of data, our results show an increased performance with MCTS of 26{\%} on the TREC-6 Questions dataset, and 10{\%} on the Stanford Sentiment Treebank SST-2 dataset. Compared with NGDG, we are able to achieve increases of 3{\%} and 5{\%} on TREC-6 and SST-2.",
}
MODS XML
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="quteineh-etal-2020-textual">
    <titleInfo>
      <title>Textual Data Augmentation for Efficient Active Learning on Tiny Datasets</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Husam</namePart>
      <namePart type="family">Quteineh</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Spyridon</namePart>
      <namePart type="family">Samothrakis</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Richard</namePart>
      <namePart type="family">Sutcliffe</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2020-11</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Bonnie</namePart>
        <namePart type="family">Webber</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Trevor</namePart>
        <namePart type="family">Cohn</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Yulan</namePart>
        <namePart type="family">He</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Yang</namePart>
        <namePart type="family">Liu</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Online</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>In this paper we propose a novel data augmentation approach where guided outputs of a language generation model, e.g. GPT-2, when labeled, can improve the performance of text classifiers through an active learning process. We transform the data generation task into an optimization problem which maximizes the usefulness of the generated output, using Monte Carlo Tree Search (MCTS) as the optimization strategy and incorporating entropy as one of the optimization criteria. We test our approach against a Non-Guided Data Generation (NGDG) process that does not optimize for a reward function. Starting with a small set of data, our results show an increased performance with MCTS of 26% on the TREC-6 Questions dataset, and 10% on the Stanford Sentiment Treebank SST-2 dataset. Compared with NGDG, we are able to achieve increases of 3% and 5% on TREC-6 and SST-2.</abstract>
    <identifier type="citekey">quteineh-etal-2020-textual</identifier>
    <identifier type="doi">10.18653/v1/2020.emnlp-main.600</identifier>
    <location>
      <url>https://aclanthology.org/2020.emnlp-main.600</url>
    </location>
    <part>
      <date>2020-11</date>
      <extent unit="page">
        <start>7400</start>
        <end>7410</end>
      </extent>
    </part>
  </mods>
</modsCollection>
Endnote
%0 Conference Proceedings
%T Textual Data Augmentation for Efficient Active Learning on Tiny Datasets
%A Quteineh, Husam
%A Samothrakis, Spyridon
%A Sutcliffe, Richard
%Y Webber, Bonnie
%Y Cohn, Trevor
%Y He, Yulan
%Y Liu, Yang
%S Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)
%D 2020
%8 November
%I Association for Computational Linguistics
%C Online
%F quteineh-etal-2020-textual
%X In this paper we propose a novel data augmentation approach where guided outputs of a language generation model, e.g. GPT-2, when labeled, can improve the performance of text classifiers through an active learning process. We transform the data generation task into an optimization problem which maximizes the usefulness of the generated output, using Monte Carlo Tree Search (MCTS) as the optimization strategy and incorporating entropy as one of the optimization criteria. We test our approach against a Non-Guided Data Generation (NGDG) process that does not optimize for a reward function. Starting with a small set of data, our results show an increased performance with MCTS of 26% on the TREC-6 Questions dataset, and 10% on the Stanford Sentiment Treebank SST-2 dataset. Compared with NGDG, we are able to achieve increases of 3% and 5% on TREC-6 and SST-2.
%R 10.18653/v1/2020.emnlp-main.600
%U https://aclanthology.org/2020.emnlp-main.600
%U https://doi.org/10.18653/v1/2020.emnlp-main.600
%P 7400-7410
Markdown (Informal)
[Textual Data Augmentation for Efficient Active Learning on Tiny Datasets](https://aclanthology.org/2020.emnlp-main.600) (Quteineh et al., EMNLP 2020)
ACL
Husam Quteineh, Spyridon Samothrakis, and Richard Sutcliffe. 2020. Textual Data Augmentation for Efficient Active Learning on Tiny Datasets. In Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP), pages 7400–7410, Online. Association for Computational Linguistics.