@inproceedings{wang-yoshinaga-2023-summarization,
title = "Summarization-based Data Augmentation for Document Classification",
author = "Wang, Yueguan and
Yoshinaga, Naoki",
editor = "Dong, Yue and
Xiao, Wen and
Wang, Lu and
Liu, Fei and
Carenini, Giuseppe",
booktitle = "Proceedings of the 4th New Frontiers in Summarization Workshop",
month = dec,
year = "2023",
address = "Singapore",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.newsum-1.5",
doi = "10.18653/v1/2023.newsum-1.5",
pages = "49--55",
abstract = "Despite the prevalence of pretrained language models in natural language understanding tasks, understanding lengthy text such as document is still challenging due to the data sparseness problem. Inspired by that humans develop their ability of understanding lengthy text form reading shorter text, we propose a simple yet effective summarization-based data augmentation, SUMMaug, for document classification. We first obtain easy-to-learn examples for the target document classification task by summarizing the input of the original training examples, while optionally merging the original labels to conform to the summarized input. We then use the generated pseudo examples to perform curriculum learning. Experimental results on two datasets confirmed the advantage of our method compared to existing baseline methods in terms of robustness and accuracy. We release our code and data at https://github.com/etsurin/summaug.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="wang-yoshinaga-2023-summarization">
    <titleInfo>
      <title>Summarization-based Data Augmentation for Document Classification</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Yueguan</namePart>
      <namePart type="family">Wang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Naoki</namePart>
      <namePart type="family">Yoshinaga</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2023-12</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 4th New Frontiers in Summarization Workshop</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Yue</namePart>
        <namePart type="family">Dong</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Wen</namePart>
        <namePart type="family">Xiao</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Lu</namePart>
        <namePart type="family">Wang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Fei</namePart>
        <namePart type="family">Liu</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Giuseppe</namePart>
        <namePart type="family">Carenini</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Singapore</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Despite the prevalence of pretrained language models in natural language understanding tasks, understanding lengthy text such as documents is still challenging due to the data sparseness problem. Inspired by the fact that humans develop their ability to understand lengthy text from reading shorter text, we propose a simple yet effective summarization-based data augmentation, SUMMaug, for document classification. We first obtain easy-to-learn examples for the target document classification task by summarizing the input of the original training examples, while optionally merging the original labels to conform to the summarized input. We then use the generated pseudo examples to perform curriculum learning. Experimental results on two datasets confirm the advantage of our method over existing baseline methods in terms of robustness and accuracy. We release our code and data at https://github.com/etsurin/summaug.</abstract>
<identifier type="citekey">wang-yoshinaga-2023-summarization</identifier>
<identifier type="doi">10.18653/v1/2023.newsum-1.5</identifier>
<location>
<url>https://aclanthology.org/2023.newsum-1.5</url>
</location>
<part>
<date>2023-12</date>
<extent unit="page">
<start>49</start>
<end>55</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Summarization-based Data Augmentation for Document Classification
%A Wang, Yueguan
%A Yoshinaga, Naoki
%Y Dong, Yue
%Y Xiao, Wen
%Y Wang, Lu
%Y Liu, Fei
%Y Carenini, Giuseppe
%S Proceedings of the 4th New Frontiers in Summarization Workshop
%D 2023
%8 December
%I Association for Computational Linguistics
%C Singapore
%F wang-yoshinaga-2023-summarization
%X Despite the prevalence of pretrained language models in natural language understanding tasks, understanding lengthy text such as documents is still challenging due to the data sparseness problem. Inspired by the fact that humans develop their ability to understand lengthy text from reading shorter text, we propose a simple yet effective summarization-based data augmentation, SUMMaug, for document classification. We first obtain easy-to-learn examples for the target document classification task by summarizing the input of the original training examples, while optionally merging the original labels to conform to the summarized input. We then use the generated pseudo examples to perform curriculum learning. Experimental results on two datasets confirm the advantage of our method over existing baseline methods in terms of robustness and accuracy. We release our code and data at https://github.com/etsurin/summaug.
%R 10.18653/v1/2023.newsum-1.5
%U https://aclanthology.org/2023.newsum-1.5
%U https://doi.org/10.18653/v1/2023.newsum-1.5
%P 49-55
Markdown (Informal)

[Summarization-based Data Augmentation for Document Classification](https://aclanthology.org/2023.newsum-1.5) (Wang & Yoshinaga, NewSum 2023)

ACL

Yueguan Wang and Naoki Yoshinaga. 2023. Summarization-based Data Augmentation for Document Classification. In Proceedings of the 4th New Frontiers in Summarization Workshop, pages 49–55, Singapore. Association for Computational Linguistics.
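
For readers skimming this record, the abstract's two-step recipe (summarize each training document into an easy-to-learn pseudo example, then train in curriculum order from pseudo to original examples) can be sketched in a few lines. This is a minimal sketch assuming Hugging Face's transformers summarization pipeline; the model choice, the summaug() helper, and the commented two-stage loop are illustrative assumptions, not the paper's released implementation, which is available at https://github.com/etsurin/summaug.

```python
# Sketch of the SUMMaug idea as described in the abstract (illustrative only):
# (1) summarize each training document to obtain an easy pseudo example,
# (2) train on the pseudo examples first, then on the originals (curriculum).
from transformers import pipeline

# Any abstractive summarizer works in principle; BART-CNN is one common choice.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

def summaug(train_set):
    """Build easy-to-learn pseudo examples from (document, label) pairs."""
    pseudo = []
    for doc, label in train_set:
        summary = summarizer(doc, max_length=128, truncation=True)[0]["summary_text"]
        # Keep the original label here; the paper optionally merges labels
        # when a summary no longer supports a fine-grained distinction.
        pseudo.append((summary, label))
    return pseudo

# Curriculum learning: fine-tune on the easy pseudo examples first, then
# continue fine-tuning on the original documents.
# for stage_data in (summaug(train_set), train_set):
#     fine_tune(classifier, stage_data)  # placeholder for any fine-tuning loop
```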