@inproceedings{van-nooten-daelemans-2023-improving,
title = "Improving {D}utch Vaccine Hesitancy Monitoring via Multi-Label Data Augmentation with {GPT}-3.5",
author = "Van Nooten, Jens and
Daelemans, Walter",
editor = "Barnes, Jeremy and
De Clercq, Orph{\'e}e and
Klinger, Roman",
booktitle = "Proceedings of the 13th Workshop on Computational Approaches to Subjectivity, Sentiment, {\&} Social Media Analysis",
month = jul,
year = "2023",
address = "Toronto, Canada",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.wassa-1.23",
doi = "10.18653/v1/2023.wassa-1.23",
pages = "251--270",
abstract = "In this paper, we leverage the GPT-3.5 language model both using the Chat-GPT API interface and the GPT-3.5 API interface to generate realistic examples of anti-vaccination tweets in Dutch with the aim of augmenting an imbalanced multi-label vaccine hesitancy argumentation classification dataset. In line with previous research, we devise a prompt that, on the one hand, instructs the model to generate realistic examples based on the gold standard dataset and, on the other hand, to assign multiple pseudo-labels (or a single pseudo-label) to the generated instances. We then augment our gold standard data with the generated examples and evaluate the impact thereof in a cross-validation setting with several state-of-the-art Dutch large language models. This augmentation technique predominantly shows improvements in F1 for classifying underrepresented classes while increasing the overall recall, paired with a slight decrease in precision for more common classes. Furthermore, we examine how well the synthetic data generalises to human data in the classification task. To our knowledge, we are the first to utilise Chat-GPT and GPT-3.5 for augmenting a Dutch multi-label dataset classification task.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="van-nooten-daelemans-2023-improving">
<titleInfo>
<title>Improving Dutch Vaccine Hesitancy Monitoring via Multi-Label Data Augmentation with GPT-3.5</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jens</namePart>
<namePart type="family">Van Nooten</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Walter</namePart>
<namePart type="family">Daelemans</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 13th Workshop on Computational Approaches to Subjectivity, Sentiment, &amp; Social Media Analysis</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jeremy</namePart>
<namePart type="family">Barnes</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Orphée</namePart>
<namePart type="family">De Clercq</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Roman</namePart>
<namePart type="family">Klinger</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Toronto, Canada</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>In this paper, we leverage the GPT-3.5 language model both using the Chat-GPT API interface and the GPT-3.5 API interface to generate realistic examples of anti-vaccination tweets in Dutch with the aim of augmenting an imbalanced multi-label vaccine hesitancy argumentation classification dataset. In line with previous research, we devise a prompt that, on the one hand, instructs the model to generate realistic examples based on the gold standard dataset and, on the other hand, to assign multiple pseudo-labels (or a single pseudo-label) to the generated instances. We then augment our gold standard data with the generated examples and evaluate the impact thereof in a cross-validation setting with several state-of-the-art Dutch large language models. This augmentation technique predominantly shows improvements in F1 for classifying underrepresented classes while increasing the overall recall, paired with a slight decrease in precision for more common classes. Furthermore, we examine how well the synthetic data generalises to human data in the classification task. To our knowledge, we are the first to utilise Chat-GPT and GPT-3.5 for augmenting a Dutch multi-label dataset classification task.</abstract>
<identifier type="citekey">van-nooten-daelemans-2023-improving</identifier>
<identifier type="doi">10.18653/v1/2023.wassa-1.23</identifier>
<location>
<url>https://aclanthology.org/2023.wassa-1.23</url>
</location>
<part>
<date>2023-07</date>
<extent unit="page">
<start>251</start>
<end>270</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Improving Dutch Vaccine Hesitancy Monitoring via Multi-Label Data Augmentation with GPT-3.5
%A Van Nooten, Jens
%A Daelemans, Walter
%Y Barnes, Jeremy
%Y De Clercq, Orphée
%Y Klinger, Roman
%S Proceedings of the 13th Workshop on Computational Approaches to Subjectivity, Sentiment, & Social Media Analysis
%D 2023
%8 July
%I Association for Computational Linguistics
%C Toronto, Canada
%F van-nooten-daelemans-2023-improving
%X In this paper, we leverage the GPT-3.5 language model both using the Chat-GPT API interface and the GPT-3.5 API interface to generate realistic examples of anti-vaccination tweets in Dutch with the aim of augmenting an imbalanced multi-label vaccine hesitancy argumentation classification dataset. In line with previous research, we devise a prompt that, on the one hand, instructs the model to generate realistic examples based on the gold standard dataset and, on the other hand, to assign multiple pseudo-labels (or a single pseudo-label) to the generated instances. We then augment our gold standard data with the generated examples and evaluate the impact thereof in a cross-validation setting with several state-of-the-art Dutch large language models. This augmentation technique predominantly shows improvements in F1 for classifying underrepresented classes while increasing the overall recall, paired with a slight decrease in precision for more common classes. Furthermore, we examine how well the synthetic data generalises to human data in the classification task. To our knowledge, we are the first to utilise Chat-GPT and GPT-3.5 for augmenting a Dutch multi-label dataset classification task.
%R 10.18653/v1/2023.wassa-1.23
%U https://aclanthology.org/2023.wassa-1.23
%U https://doi.org/10.18653/v1/2023.wassa-1.23
%P 251-270
Markdown (Informal)
[Improving Dutch Vaccine Hesitancy Monitoring via Multi-Label Data Augmentation with GPT-3.5](https://aclanthology.org/2023.wassa-1.23) (Van Nooten & Daelemans, WASSA 2023)
ACL