@inproceedings{yang-etal-2024-mini,
title = "Mini-{DA}: Improving Your Model Performance through Minimal Data Augmentation using {LLM}",
author = "Yang, Shuangtao and
Liu, Xiaoyi and
Dong, Xiaozheng and
Fu, Bo",
editor = "Dragut, Eduard and
Li, Yunyao and
Popa, Lucian and
Vucetic, Slobodan and
Srivastava, Shashank",
booktitle = "Proceedings of the Fifth Workshop on Data Science with Human-in-the-Loop (DaSH 2024)",
month = jun,
year = "2024",
address = "Mexico City, Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.dash-1.4",
doi = "10.18653/v1/2024.dash-1.4",
pages = "25--30",
abstract = "When performing data augmentation using large language models (LLMs), the common approach is to directly generate a large number of new samples based on the original dataset, and then model is trained on the integration of augmented dataset and the original dataset. However, data generation demands extensive computational resources. In this study, we propose Mini-DA, a minimized data augmentation method that leverages the feedback from the target model during the training process to select only the most challenging samples from the validation set for augmentation. Our experimental results show in text classification task, by using as little as 13 percent of the original augmentation volume, Mini-DA can achieve performance comparable to full data augmentation for intent detection task, significantly improving data and computational resource utilization efficiency.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="yang-etal-2024-mini">
<titleInfo>
<title>Mini-DA: Improving Your Model Performance through Minimal Data Augmentation using LLM</title>
</titleInfo>
<name type="personal">
<namePart type="given">Shuangtao</namePart>
<namePart type="family">Yang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xiaoyi</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xiaozheng</namePart>
<namePart type="family">Dong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bo</namePart>
<namePart type="family">Fu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Fifth Workshop on Data Science with Human-in-the-Loop (DaSH 2024)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Eduard</namePart>
<namePart type="family">Dragut</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yunyao</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lucian</namePart>
<namePart type="family">Popa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Slobodan</namePart>
<namePart type="family">Vucetic</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shashank</namePart>
<namePart type="family">Srivastava</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Mexico City, Mexico</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>When performing data augmentation with large language models (LLMs), the common approach is to directly generate a large number of new samples from the original dataset and then train the model on the combination of the augmented and original data. However, such data generation demands extensive computational resources. In this study, we propose Mini-DA, a minimized data augmentation method that leverages feedback from the target model during training to select only the most challenging samples from the validation set for augmentation. Our experimental results on an intent detection task (a text classification task) show that, using as little as 13 percent of the original augmentation volume, Mini-DA achieves performance comparable to full data augmentation, significantly improving data and computational efficiency.</abstract>
<identifier type="citekey">yang-etal-2024-mini</identifier>
<identifier type="doi">10.18653/v1/2024.dash-1.4</identifier>
<location>
<url>https://aclanthology.org/2024.dash-1.4</url>
</location>
<part>
<date>2024-06</date>
<extent unit="page">
<start>25</start>
<end>30</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Mini-DA: Improving Your Model Performance through Minimal Data Augmentation using LLM
%A Yang, Shuangtao
%A Liu, Xiaoyi
%A Dong, Xiaozheng
%A Fu, Bo
%Y Dragut, Eduard
%Y Li, Yunyao
%Y Popa, Lucian
%Y Vucetic, Slobodan
%Y Srivastava, Shashank
%S Proceedings of the Fifth Workshop on Data Science with Human-in-the-Loop (DaSH 2024)
%D 2024
%8 June
%I Association for Computational Linguistics
%C Mexico City, Mexico
%F yang-etal-2024-mini
%X When performing data augmentation with large language models (LLMs), the common approach is to directly generate a large number of new samples from the original dataset and then train the model on the combination of the augmented and original data. However, such data generation demands extensive computational resources. In this study, we propose Mini-DA, a minimized data augmentation method that leverages feedback from the target model during training to select only the most challenging samples from the validation set for augmentation. Our experimental results on an intent detection task (a text classification task) show that, using as little as 13 percent of the original augmentation volume, Mini-DA achieves performance comparable to full data augmentation, significantly improving data and computational efficiency.
%R 10.18653/v1/2024.dash-1.4
%U https://aclanthology.org/2024.dash-1.4
%U https://doi.org/10.18653/v1/2024.dash-1.4
%P 25-30
Markdown (Informal)
[Mini-DA: Improving Your Model Performance through Minimal Data Augmentation using LLM](https://aclanthology.org/2024.dash-1.4) (Yang et al., DaSH-WS 2024)
ACL
Shuangtao Yang, Xiaoyi Liu, Xiaozheng Dong, and Bo Fu. 2024. [Mini-DA: Improving Your Model Performance through Minimal Data Augmentation using LLM](https://aclanthology.org/2024.dash-1.4). In *Proceedings of the Fifth Workshop on Data Science with Human-in-the-Loop (DaSH 2024)*, pages 25–30, Mexico City, Mexico. Association for Computational Linguistics.
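
The abstract above outlines the core idea of Mini-DA: use the target model's feedback on the validation set to pick only the hardest samples as seeds for LLM-based augmentation. Below is a minimal, illustrative sketch of that selection loop, assuming a scikit-learn-style classifier with a `predict_proba` interface, integer class labels, and a hypothetical `generate_augmentations` helper that would call an LLM; none of these names, thresholds, or signatures come from the paper itself.

```python
# Illustrative sketch only: the actual Mini-DA procedure, prompts, and
# selection criteria are described in the paper; the names and the
# confidence threshold below are hypothetical.
import numpy as np

def select_hard_samples(model, X_val, y_val, confidence_threshold=0.6):
    """Return validation samples the target model misclassifies or
    predicts with low confidence; these become augmentation seeds."""
    probs = model.predict_proba(X_val)        # shape: (n_samples, n_classes)
    preds = probs.argmax(axis=1)
    confidences = probs.max(axis=1)
    hard_mask = (preds != np.asarray(y_val)) | (confidences < confidence_threshold)
    return [x for x, hard in zip(X_val, hard_mask) if hard]

def mini_da_round(model, X_train, y_train, X_val, y_val, generate_augmentations):
    """One round: train the target model, find hard validation samples,
    and augment only those instead of the whole dataset."""
    model.fit(X_train, y_train)
    hard_samples = select_hard_samples(model, X_val, y_val)
    # generate_augmentations would prompt an LLM to produce new labeled
    # samples similar to the hard ones (hypothetical helper).
    new_X, new_y = generate_augmentations(hard_samples)
    return list(X_train) + list(new_X), list(y_train) + list(new_y)
```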