@inproceedings{lemmens-daelemans-2023-combining,
title = "Combining Active Learning and Task Adaptation with {BERT} for Cost-Effective Annotation of Social Media Datasets",
author = "Lemmens, Jens and
Daelemans, Walter",
editor = "Barnes, Jeremy and
De Clercq, Orph{\'e}e and
Klinger, Roman",
booktitle = "Proceedings of the 13th Workshop on Computational Approaches to Subjectivity, Sentiment, {\&} Social Media Analysis",
month = jul,
year = "2023",
address = "Toronto, Canada",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.wassa-1.22",
doi = "10.18653/v1/2023.wassa-1.22",
pages = "237--250",
abstract = "Social media provide a rich source of data that can be mined and used for a wide variety of research purposes. However, annotating this data can be expensive, yet necessary for state-of-the-art pre-trained language models to achieve high prediction performance. Therefore, we combine pool-based active learning based on prediction uncertainty (an established method for reducing annotation costs) with unsupervised task adaptation through Masked Language Modeling (MLM). The results on three different datasets (two social media corpora, one benchmark dataset) show that task adaptation significantly improves results and that with only a fraction of the available training data, this approach reaches similar F1-scores as those achieved by an upper-bound baseline model fine-tuned on all training data. We hereby contribute to the scarce corpus of research on active learning with pre-trained language models and propose a cost-efficient annotation sampling and fine-tuning approach that can be applied to a wide variety of tasks and datasets.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="lemmens-daelemans-2023-combining">
<titleInfo>
<title>Combining Active Learning and Task Adaptation with BERT for Cost-Effective Annotation of Social Media Datasets</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jens</namePart>
<namePart type="family">Lemmens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Walter</namePart>
<namePart type="family">Daelemans</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 13th Workshop on Computational Approaches to Subjectivity, Sentiment, &amp; Social Media Analysis</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jeremy</namePart>
<namePart type="family">Barnes</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Orphée</namePart>
<namePart type="family">De Clercq</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Roman</namePart>
<namePart type="family">Klinger</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Toronto, Canada</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Social media provide a rich source of data that can be mined and used for a wide variety of research purposes. However, annotating this data can be expensive, yet necessary for state-of-the-art pre-trained language models to achieve high prediction performance. Therefore, we combine pool-based active learning based on prediction uncertainty (an established method for reducing annotation costs) with unsupervised task adaptation through Masked Language Modeling (MLM). The results on three different datasets (two social media corpora, one benchmark dataset) show that task adaptation significantly improves results and that with only a fraction of the available training data, this approach reaches similar F1-scores as those achieved by an upper-bound baseline model fine-tuned on all training data. We hereby contribute to the scarce corpus of research on active learning with pre-trained language models and propose a cost-efficient annotation sampling and fine-tuning approach that can be applied to a wide variety of tasks and datasets.</abstract>
<identifier type="citekey">lemmens-daelemans-2023-combining</identifier>
<identifier type="doi">10.18653/v1/2023.wassa-1.22</identifier>
<location>
<url>https://aclanthology.org/2023.wassa-1.22</url>
</location>
<part>
<date>2023-07</date>
<extent unit="page">
<start>237</start>
<end>250</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Combining Active Learning and Task Adaptation with BERT for Cost-Effective Annotation of Social Media Datasets
%A Lemmens, Jens
%A Daelemans, Walter
%Y Barnes, Jeremy
%Y De Clercq, Orphée
%Y Klinger, Roman
%S Proceedings of the 13th Workshop on Computational Approaches to Subjectivity, Sentiment, & Social Media Analysis
%D 2023
%8 July
%I Association for Computational Linguistics
%C Toronto, Canada
%F lemmens-daelemans-2023-combining
%X Social media provide a rich source of data that can be mined and used for a wide variety of research purposes. However, annotating this data can be expensive, yet necessary for state-of-the-art pre-trained language models to achieve high prediction performance. Therefore, we combine pool-based active learning based on prediction uncertainty (an established method for reducing annotation costs) with unsupervised task adaptation through Masked Language Modeling (MLM). The results on three different datasets (two social media corpora, one benchmark dataset) show that task adaptation significantly improves results and that with only a fraction of the available training data, this approach reaches similar F1-scores as those achieved by an upper-bound baseline model fine-tuned on all training data. We hereby contribute to the scarce corpus of research on active learning with pre-trained language models and propose a cost-efficient annotation sampling and fine-tuning approach that can be applied to a wide variety of tasks and datasets.
%R 10.18653/v1/2023.wassa-1.22
%U https://aclanthology.org/2023.wassa-1.22
%U https://doi.org/10.18653/v1/2023.wassa-1.22
%P 237-250
Markdown (Informal)
[Combining Active Learning and Task Adaptation with BERT for Cost-Effective Annotation of Social Media Datasets](https://aclanthology.org/2023.wassa-1.22) (Lemmens & Daelemans, WASSA 2023)
ACL
Jens Lemmens and Walter Daelemans. 2023. [Combining Active Learning and Task Adaptation with BERT for Cost-Effective Annotation of Social Media Datasets](https://aclanthology.org/2023.wassa-1.22). In *Proceedings of the 13th Workshop on Computational Approaches to Subjectivity, Sentiment, & Social Media Analysis*, pages 237–250, Toronto, Canada. Association for Computational Linguistics.
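
The abstract describes pool-based active learning driven by prediction uncertainty, combined with unsupervised task adaptation through masked language modeling. As an informal illustration only (not the authors' code), the sketch below shows one way such an uncertainty-sampling loop can be structured; a scikit-learn LogisticRegression stands in for the fine-tuned BERT classifier, the data is synthetic, and the pool size, query batch size, and round count are assumptions made for the example. Task-adaptive MLM pre-training on the unlabeled pool would precede this loop and is omitted here.

```python
# Illustrative sketch of pool-based active learning with prediction-uncertainty
# (least-confidence) sampling. A LogisticRegression stands in for the fine-tuned
# BERT classifier in the paper; all sizes and counts below are assumptions.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

rng = np.random.default_rng(0)

# Synthetic unlabeled pool plus a small labeled seed set.
X, y = make_classification(n_samples=2000, n_features=20, random_state=0)
labeled = rng.choice(len(X), size=50, replace=False)
pool = np.setdiff1d(np.arange(len(X)), labeled)

BATCH = 50   # examples sent for annotation per round (assumed)
ROUNDS = 5   # number of active-learning rounds (assumed)

for _ in range(ROUNDS):
    # Re-train the classifier on everything labeled so far.
    clf = LogisticRegression(max_iter=1000).fit(X[labeled], y[labeled])

    # Least-confidence uncertainty: 1 minus the top class probability.
    probs = clf.predict_proba(X[pool])
    uncertainty = 1.0 - probs.max(axis=1)

    # Query the most uncertain pool examples and "annotate" them
    # (here we simply reveal the gold labels of the synthetic data).
    query = pool[np.argsort(-uncertainty)[:BATCH]]
    labeled = np.concatenate([labeled, query])
    pool = np.setdiff1d(pool, query)

print(f"labeled examples after active learning: {len(labeled)}")
```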