@inproceedings{steglich-poppe-2026-active,
title = "Active Learning for Corpus Refinement: Cost-Effective Preprocessing to Improve Validity of Applied Quantitative Text Analysis",
author = "Steglich, Jakob and
Poppe, Stephan",
editor = "Baez Santamaria, Selene and
Somayajula, Sai Ashish and
Yamaguchi, Atsuki",
booktitle = "Proceedings of the 19th Conference of the {E}uropean Chapter of the {A}ssociation for {C}omputational {L}inguistics (Volume 4: Student Research Workshop)",
month = mar,
year = "2026",
address = "Rabat, Morocco",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.eacl-srw.70/",
pages = "952--966",
    isbn = "979-8-89176-383-8",
abstract = "Quantitative text analysis relies on high-quality corpora, but keyword-based collection often retrieves irrelevant material, undermining validity. We show that active learning with a transformer-based classifier can iteratively refine corpora by excluding irrelevant documents, prompting researchers to clarify inclusion criteria and address edge cases. Applied to German newspaper articles on depression and schizophrenia, this approach improves construct validity and reduces labeling effort. The document relevance classifiers reached an F1-score of 0.8 with just 100{--}150 labeled snippets, with further gains from tuning, outperforming both random sampling and a weakly supervised sampling baseline. Filtering non-medical articles further had little effect on downstream depression stigmatization measures but increased schizophrenia stigmatization. Active learning thus enables efficient corpus validation and clearer concept boundaries with minimal preprocessing. The source code is publicly available at https://github.com/jakobstgl/active-learning-corpus-refinement."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="steglich-poppe-2026-active">
<titleInfo>
<title>Active Learning for Corpus Refinement: Cost-Effective Preprocessing to Improve Validity of Applied Quantitative Text Analysis</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jakob</namePart>
<namePart type="family">Steglich</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stephan</namePart>
<namePart type="family">Poppe</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 19th Conference of the European Chapter of the Association for Computational Linguistics (Volume 4: Student Research Workshop)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Selene</namePart>
<namePart type="family">Baez Santamaria</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sai</namePart>
<namePart type="given">Ashish</namePart>
<namePart type="family">Somayajula</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Atsuki</namePart>
<namePart type="family">Yamaguchi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Rabat, Morocco</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-383-8</identifier>
</relatedItem>
<abstract>Quantitative text analysis relies on high-quality corpora, but keyword-based collection often retrieves irrelevant material, undermining validity. We show that active learning with a transformer-based classifier can iteratively refine corpora by excluding irrelevant documents, prompting researchers to clarify inclusion criteria and address edge cases. Applied to German newspaper articles on depression and schizophrenia, this approach improves construct validity and reduces labeling effort. The document relevance classifiers reached an F1-score of 0.8 with just 100–150 labeled snippets, with further gains from tuning, outperforming both random sampling and a weakly supervised sampling baseline. Filtering non-medical articles further had little effect on downstream depression stigmatization measures but increased schizophrenia stigmatization. Active learning thus enables efficient corpus validation and clearer concept boundaries with minimal preprocessing. The source code is publicly available at https://github.com/jakobstgl/active-learning-corpus-refinement.</abstract>
<identifier type="citekey">steglich-poppe-2026-active</identifier>
<location>
<url>https://aclanthology.org/2026.eacl-srw.70/</url>
</location>
<part>
<date>2026-03</date>
<extent unit="page">
<start>952</start>
<end>966</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Active Learning for Corpus Refinement: Cost-Effective Preprocessing to Improve Validity of Applied Quantitative Text Analysis
%A Steglich, Jakob
%A Poppe, Stephan
%Y Baez Santamaria, Selene
%Y Somayajula, Sai Ashish
%Y Yamaguchi, Atsuki
%S Proceedings of the 19th Conference of the European Chapter of the Association for Computational Linguistics (Volume 4: Student Research Workshop)
%D 2026
%8 March
%I Association for Computational Linguistics
%C Rabat, Morocco
%@ 979-8-89176-383-8
%F steglich-poppe-2026-active
%X Quantitative text analysis relies on high-quality corpora, but keyword-based collection often retrieves irrelevant material, undermining validity. We show that active learning with a transformer-based classifier can iteratively refine corpora by excluding irrelevant documents, prompting researchers to clarify inclusion criteria and address edge cases. Applied to German newspaper articles on depression and schizophrenia, this approach improves construct validity and reduces labeling effort. The document relevance classifiers reached an F1-score of 0.8 with just 100–150 labeled snippets, with further gains from tuning, outperforming both random sampling and a weakly supervised sampling baseline. Filtering non-medical articles further had little effect on downstream depression stigmatization measures but increased schizophrenia stigmatization. Active learning thus enables efficient corpus validation and clearer concept boundaries with minimal preprocessing. The source code is publicly available at https://github.com/jakobstgl/active-learning-corpus-refinement.
%U https://aclanthology.org/2026.eacl-srw.70/
%P 952-966
Markdown (Informal)
[Active Learning for Corpus Refinement: Cost-Effective Preprocessing to Improve Validity of Applied Quantitative Text Analysis](https://aclanthology.org/2026.eacl-srw.70/) (Steglich & Poppe, EACL 2026)
ACL