@inproceedings{rohan-etal-2025-bd,
title = "{BD} at {BEA} 2025 Shared Task: {MPN}et Ensembles for Pedagogical Mistake Identification and Localization in {AI} Tutor Responses",
author = "Rohan, Shadman and
Sur Apan, Ishita and
Shochcho, Muhtasim Ibteda and
Fahim, Md and
Rahman, Mohammad Ashfaq Ur and
Rahman, AKM Mahbubur and
Ali, Amin Ahsan",
editor = {Kochmar, Ekaterina and
Alhafni, Bashar and
Bexte, Marie and
Burstein, Jill and
Horbach, Andrea and
Laarmann-Quante, Ronja and
Tack, Ana{\"i}s and
Yaneva, Victoria and
Yuan, Zheng},
booktitle = "Proceedings of the 20th Workshop on Innovative Use of NLP for Building Educational Applications (BEA 2025)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.bea-1.102/",
doi = "10.18653/v1/2025.bea-1.102",
pages = "1266--1277",
ISBN = "979-8-89176-270-1",
abstract = "We present Team BD{'}s submission to the BEA 2025 Shared Task on Pedagogical Ability Assessment of AI-powered Tutors, under Track 1 (Mistake Identification) and Track 2 (Mistake Location). Both tracks involve three-class classification of tutor responses in educational dialogues {--} determining if a tutor correctly recognizes a student{'}s mistake (Track 1) and whether the tutor pinpoints the mistake{'}s location (Track 2). Our system is built on MPNet, a Transformer-based language modelthat combines BERT and XLNet{'}s pre-training advantages. We fine-tuned MPNet on the task data using a class-weighted cross-entropy loss to handle class imbalance, and leveraged grouped cross-validation (10 folds) to maximize the use of limited data while avoiding dialogue overlap between training and validation. We then performed a hard-voting ensemble of the best models from each fold, which improves robustness and generalization by combining multiple classifiers. Ourapproach achieved strong results on both tracks, with exact-match macro-F1 scores of approximately 0.7110 for Mistake Identification and 0.5543 for Mistake Location on the official test set. We include comprehensive analysis of our system{'}s performance, including confusion matrices and t-SNE visualizations to interpret classifier behavior, as well as a taxonomy of common errors with examples. We hope our ensemble-based approach and findings provide useful insights for designing reliable tutor response evaluation systems in educational dialogue settings."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="rohan-etal-2025-bd">
<titleInfo>
<title>BD at BEA 2025 Shared Task: MPNet Ensembles for Pedagogical Mistake Identification and Localization in AI Tutor Responses</title>
</titleInfo>
<name type="personal">
<namePart type="given">Shadman</namePart>
<namePart type="family">Rohan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ishita</namePart>
<namePart type="family">Sur Apan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Muhtasim</namePart>
<namePart type="given">Ibteda</namePart>
<namePart type="family">Shochcho</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Md</namePart>
<namePart type="family">Fahim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohammad</namePart>
<namePart type="given">Ashfaq</namePart>
<namePart type="given">Ur</namePart>
<namePart type="family">Rahman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">AKM</namePart>
<namePart type="given">Mahbubur</namePart>
<namePart type="family">Rahman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Amin</namePart>
<namePart type="given">Ahsan</namePart>
<namePart type="family">Ali</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 20th Workshop on Innovative Use of NLP for Building Educational Applications (BEA 2025)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Kochmar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bashar</namePart>
<namePart type="family">Alhafni</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marie</namePart>
<namePart type="family">Bexte</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jill</namePart>
<namePart type="family">Burstein</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Andrea</namePart>
<namePart type="family">Horbach</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ronja</namePart>
<namePart type="family">Laarmann-Quante</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anaïs</namePart>
<namePart type="family">Tack</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Victoria</namePart>
<namePart type="family">Yaneva</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zheng</namePart>
<namePart type="family">Yuan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-270-1</identifier>
</relatedItem>
<abstract>We present Team BD’s submission to the BEA 2025 Shared Task on Pedagogical Ability Assessment of AI-powered Tutors, under Track 1 (Mistake Identification) and Track 2 (Mistake Location). Both tracks involve three-class classification of tutor responses in educational dialogues – determining if a tutor correctly recognizes a student’s mistake (Track 1) and whether the tutor pinpoints the mistake’s location (Track 2). Our system is built on MPNet, a Transformer-based language modelthat combines BERT and XLNet’s pre-training advantages. We fine-tuned MPNet on the task data using a class-weighted cross-entropy loss to handle class imbalance, and leveraged grouped cross-validation (10 folds) to maximize the use of limited data while avoiding dialogue overlap between training and validation. We then performed a hard-voting ensemble of the best models from each fold, which improves robustness and generalization by combining multiple classifiers. Ourapproach achieved strong results on both tracks, with exact-match macro-F1 scores of approximately 0.7110 for Mistake Identification and 0.5543 for Mistake Location on the official test set. We include comprehensive analysis of our system’s performance, including confusion matrices and t-SNE visualizations to interpret classifier behavior, as well as a taxonomy of common errors with examples. We hope our ensemble-based approach and findings provide useful insights for designing reliable tutor response evaluation systems in educational dialogue settings.</abstract>
<identifier type="citekey">rohan-etal-2025-bd</identifier>
<identifier type="doi">10.18653/v1/2025.bea-1.102</identifier>
<location>
<url>https://aclanthology.org/2025.bea-1.102/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>1266</start>
<end>1277</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T BD at BEA 2025 Shared Task: MPNet Ensembles for Pedagogical Mistake Identification and Localization in AI Tutor Responses
%A Rohan, Shadman
%A Sur Apan, Ishita
%A Shochcho, Muhtasim Ibteda
%A Fahim, Md
%A Rahman, Mohammad Ashfaq Ur
%A Rahman, AKM Mahbubur
%A Ali, Amin Ahsan
%Y Kochmar, Ekaterina
%Y Alhafni, Bashar
%Y Bexte, Marie
%Y Burstein, Jill
%Y Horbach, Andrea
%Y Laarmann-Quante, Ronja
%Y Tack, Anaïs
%Y Yaneva, Victoria
%Y Yuan, Zheng
%S Proceedings of the 20th Workshop on Innovative Use of NLP for Building Educational Applications (BEA 2025)
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-270-1
%F rohan-etal-2025-bd
%X We present Team BD’s submission to the BEA 2025 Shared Task on Pedagogical Ability Assessment of AI-powered Tutors, under Track 1 (Mistake Identification) and Track 2 (Mistake Location). Both tracks involve three-class classification of tutor responses in educational dialogues – determining if a tutor correctly recognizes a student’s mistake (Track 1) and whether the tutor pinpoints the mistake’s location (Track 2). Our system is built on MPNet, a Transformer-based language modelthat combines BERT and XLNet’s pre-training advantages. We fine-tuned MPNet on the task data using a class-weighted cross-entropy loss to handle class imbalance, and leveraged grouped cross-validation (10 folds) to maximize the use of limited data while avoiding dialogue overlap between training and validation. We then performed a hard-voting ensemble of the best models from each fold, which improves robustness and generalization by combining multiple classifiers. Ourapproach achieved strong results on both tracks, with exact-match macro-F1 scores of approximately 0.7110 for Mistake Identification and 0.5543 for Mistake Location on the official test set. We include comprehensive analysis of our system’s performance, including confusion matrices and t-SNE visualizations to interpret classifier behavior, as well as a taxonomy of common errors with examples. We hope our ensemble-based approach and findings provide useful insights for designing reliable tutor response evaluation systems in educational dialogue settings.
%R 10.18653/v1/2025.bea-1.102
%U https://aclanthology.org/2025.bea-1.102/
%U https://doi.org/10.18653/v1/2025.bea-1.102
%P 1266-1277
Markdown (Informal)
[BD at BEA 2025 Shared Task: MPNet Ensembles for Pedagogical Mistake Identification and Localization in AI Tutor Responses](https://aclanthology.org/2025.bea-1.102/) (Rohan et al., BEA 2025)
ACL