@inproceedings{schaller-etal-2025-dont,
title = "Don{'}t Score too Early! Evaluating Argument Mining Models on Incomplete Essays",
author = "Schaller, Nils-Jonathan and
Ding, Yuning and
Jansen, Thorben and
Horbach, Andrea",
editor = {Kochmar, Ekaterina and
Alhafni, Bashar and
Bexte, Marie and
Burstein, Jill and
Horbach, Andrea and
Laarmann-Quante, Ronja and
Tack, Ana{\"i}s and
Yaneva, Victoria and
Yuan, Zheng},
booktitle = "Proceedings of the 20th Workshop on Innovative Use of NLP for Building Educational Applications (BEA 2025)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.bea-1.27/",
doi = "10.18653/v1/2025.bea-1.27",
pages = "345--355",
ISBN = "979-8-89176-270-1",
abstract = "Students' argumentative writing benefits from receiving automated feedback, particularly throughout the writing process. While Argument Mining (AM) technology shows promise for delivering automated feedback on argumentative structures, existing systems are frequently trained on completed essays, providing rich context information and raising concerns about their usefulness for offering writing support on incomplete texts during the writing process. This study evaluates the robustness of AM algorithms on artificially fragmented learner texts from two large-scale corpora of secondary school essays: the German DARIUS corpus and the English PERSUADE corpus. Our analysis reveals that token-level sequence-tagging methods, while highly effective on complete essays, suffer significantly when context is limited or misleading. Conversely, sentence-level classifiers maintain relative stability under such conditions. We show that deliberately training AM models on fragmented input substantially mitigates these context-related weaknesses, enabling AM systems to support dynamic educational writing scenarios better."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="schaller-etal-2025-dont">
<titleInfo>
<title>Don’t Score too Early! Evaluating Argument Mining Models on Incomplete Essays</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nils-Jonathan</namePart>
<namePart type="family">Schaller</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yuning</namePart>
<namePart type="family">Ding</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Thorben</namePart>
<namePart type="family">Jansen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Andrea</namePart>
<namePart type="family">Horbach</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 20th Workshop on Innovative Use of NLP for Building Educational Applications (BEA 2025)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Kochmar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bashar</namePart>
<namePart type="family">Alhafni</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marie</namePart>
<namePart type="family">Bexte</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jill</namePart>
<namePart type="family">Burstein</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Andrea</namePart>
<namePart type="family">Horbach</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ronja</namePart>
<namePart type="family">Laarmann-Quante</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anaïs</namePart>
<namePart type="family">Tack</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Victoria</namePart>
<namePart type="family">Yaneva</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zheng</namePart>
<namePart type="family">Yuan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-270-1</identifier>
</relatedItem>
<abstract>Students’ argumentative writing benefits from receiving automated feedback, particularly throughout the writing process. While Argument Mining (AM) technology shows promise for delivering automated feedback on argumentative structures, existing systems are frequently trained on completed essays, providing rich context information and raising concerns about their usefulness for offering writing support on incomplete texts during the writing process. This study evaluates the robustness of AM algorithms on artificially fragmented learner texts from two large-scale corpora of secondary school essays: the German DARIUS corpus and the English PERSUADE corpus. Our analysis reveals that token-level sequence-tagging methods, while highly effective on complete essays, suffer significantly when context is limited or misleading. Conversely, sentence-level classifiers maintain relative stability under such conditions. We show that deliberately training AM models on fragmented input substantially mitigates these context-related weaknesses, enabling AM systems to support dynamic educational writing scenarios better.</abstract>
<identifier type="citekey">schaller-etal-2025-dont</identifier>
<identifier type="doi">10.18653/v1/2025.bea-1.27</identifier>
<location>
<url>https://aclanthology.org/2025.bea-1.27/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>345</start>
<end>355</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Don’t Score too Early! Evaluating Argument Mining Models on Incomplete Essays
%A Schaller, Nils-Jonathan
%A Ding, Yuning
%A Jansen, Thorben
%A Horbach, Andrea
%Y Kochmar, Ekaterina
%Y Alhafni, Bashar
%Y Bexte, Marie
%Y Burstein, Jill
%Y Horbach, Andrea
%Y Laarmann-Quante, Ronja
%Y Tack, Anaïs
%Y Yaneva, Victoria
%Y Yuan, Zheng
%S Proceedings of the 20th Workshop on Innovative Use of NLP for Building Educational Applications (BEA 2025)
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-270-1
%F schaller-etal-2025-dont
%X Students’ argumentative writing benefits from receiving automated feedback, particularly throughout the writing process. While Argument Mining (AM) technology shows promise for delivering automated feedback on argumentative structures, existing systems are frequently trained on completed essays, providing rich context information and raising concerns about their usefulness for offering writing support on incomplete texts during the writing process. This study evaluates the robustness of AM algorithms on artificially fragmented learner texts from two large-scale corpora of secondary school essays: the German DARIUS corpus and the English PERSUADE corpus. Our analysis reveals that token-level sequence-tagging methods, while highly effective on complete essays, suffer significantly when context is limited or misleading. Conversely, sentence-level classifiers maintain relative stability under such conditions. We show that deliberately training AM models on fragmented input substantially mitigates these context-related weaknesses, enabling AM systems to support dynamic educational writing scenarios better.
%R 10.18653/v1/2025.bea-1.27
%U https://aclanthology.org/2025.bea-1.27/
%U https://doi.org/10.18653/v1/2025.bea-1.27
%P 345-355
Markdown (Informal)
[Don’t Score too Early! Evaluating Argument Mining Models on Incomplete Essays](https://aclanthology.org/2025.bea-1.27/) (Schaller et al., BEA 2025)
ACL
Nils-Jonathan Schaller, Yuning Ding, Thorben Jansen, and Andrea Horbach. 2025. Don’t Score too Early! Evaluating Argument Mining Models on Incomplete Essays. In Proceedings of the 20th Workshop on Innovative Use of NLP for Building Educational Applications (BEA 2025), pages 345–355, Vienna, Austria. Association for Computational Linguistics.