@inproceedings{kochmar-etal-2025-findings,
title = "Findings of the {BEA} 2025 Shared Task on Pedagogical Ability Assessment of {AI}-powered Tutors",
author = {Kochmar, Ekaterina and
Maurya, Kaushal and
Petukhova, Kseniia and
Srivatsa, KV Aditya and
Tack, Ana{\"i}s and
Vasselli, Justin},
editor = {Kochmar, Ekaterina and
Alhafni, Bashar and
Bexte, Marie and
Burstein, Jill and
Horbach, Andrea and
Laarmann-Quante, Ronja and
Tack, Ana{\"i}s and
Yaneva, Victoria and
Yuan, Zheng},
booktitle = "Proceedings of the 20th Workshop on Innovative Use of NLP for Building Educational Applications (BEA 2025)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.bea-1.77/",
doi = "10.18653/v1/2025.bea-1.77",
pages = "1011--1033",
ISBN = "979-8-89176-270-1",
abstract = "This shared task has aimed to assess pedagogical abilities of AI tutors powered by large language models (LLMs), focusing on evaluating the quality of tutor responses aimed at student{'}s mistake remediation within educational dialogues. The task consisted of five tracks designed to automatically evaluate the AI tutor{'}s performance across key dimensions of mistake identification, precise location of the mistake, providing guidance, and feedback actionability, grounded in learning science principles that define good and effective tutor responses, as well as the track focusing on detection of the tutor identity. The task attracted over 50 international teams across all tracks. The submitted models were evaluated against gold-standard human annotations, and the results, while promising, show that there is still significant room for improvement in this domain: the best results for the four pedagogical ability assessment tracks range between macro F1 scores of 58.34 (for providing guidance) and 71.81 (for mistake identification) on three-class problems, with the best F1 score in the tutor identification track reaching 96.98 on a 9-class task. In this paper, we overview the main findings of the shared task, discuss the approaches taken by the teams, and analyze their performance. All resources associated with this task are made publicly available to support futureresearch in this critical domain (https://github.com/kaushal0494/UnifyingAITutorEvaluation/tree/main/BEA{\_}Shared{\_}Task{\_}2025{\_}Datasets)."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="kochmar-etal-2025-findings">
<titleInfo>
<title>Findings of the BEA 2025 Shared Task on Pedagogical Ability Assessment of AI-powered Tutors</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Kochmar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kaushal</namePart>
<namePart type="family">Maurya</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kseniia</namePart>
<namePart type="family">Petukhova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">KV</namePart>
<namePart type="given">Aditya</namePart>
<namePart type="family">Srivatsa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anaïs</namePart>
<namePart type="family">Tack</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Justin</namePart>
<namePart type="family">Vasselli</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 20th Workshop on Innovative Use of NLP for Building Educational Applications (BEA 2025)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Kochmar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bashar</namePart>
<namePart type="family">Alhafni</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marie</namePart>
<namePart type="family">Bexte</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jill</namePart>
<namePart type="family">Burstein</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Andrea</namePart>
<namePart type="family">Horbach</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ronja</namePart>
<namePart type="family">Laarmann-Quante</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anaïs</namePart>
<namePart type="family">Tack</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Victoria</namePart>
<namePart type="family">Yaneva</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zheng</namePart>
<namePart type="family">Yuan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-270-1</identifier>
</relatedItem>
<abstract>This shared task has aimed to assess pedagogical abilities of AI tutors powered by large language models (LLMs), focusing on evaluating the quality of tutor responses aimed at student’s mistake remediation within educational dialogues. The task consisted of five tracks designed to automatically evaluate the AI tutor’s performance across key dimensions of mistake identification, precise location of the mistake, providing guidance, and feedback actionability, grounded in learning science principles that define good and effective tutor responses, as well as the track focusing on detection of the tutor identity. The task attracted over 50 international teams across all tracks. The submitted models were evaluated against gold-standard human annotations, and the results, while promising, show that there is still significant room for improvement in this domain: the best results for the four pedagogical ability assessment tracks range between macro F1 scores of 58.34 (for providing guidance) and 71.81 (for mistake identification) on three-class problems, with the best F1 score in the tutor identification track reaching 96.98 on a 9-class task. In this paper, we overview the main findings of the shared task, discuss the approaches taken by the teams, and analyze their performance. All resources associated with this task are made publicly available to support futureresearch in this critical domain (https://github.com/kaushal0494/UnifyingAITutorEvaluation/tree/main/BEA_Shared_Task_2025_Datasets).</abstract>
<identifier type="citekey">kochmar-etal-2025-findings</identifier>
<identifier type="doi">10.18653/v1/2025.bea-1.77</identifier>
<location>
<url>https://aclanthology.org/2025.bea-1.77/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>1011</start>
<end>1033</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Findings of the BEA 2025 Shared Task on Pedagogical Ability Assessment of AI-powered Tutors
%A Kochmar, Ekaterina
%A Maurya, Kaushal
%A Petukhova, Kseniia
%A Srivatsa, KV Aditya
%A Tack, Anaïs
%A Vasselli, Justin
%Y Kochmar, Ekaterina
%Y Alhafni, Bashar
%Y Bexte, Marie
%Y Burstein, Jill
%Y Horbach, Andrea
%Y Laarmann-Quante, Ronja
%Y Tack, Anaïs
%Y Yaneva, Victoria
%Y Yuan, Zheng
%S Proceedings of the 20th Workshop on Innovative Use of NLP for Building Educational Applications (BEA 2025)
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-270-1
%F kochmar-etal-2025-findings
%X This shared task has aimed to assess pedagogical abilities of AI tutors powered by large language models (LLMs), focusing on evaluating the quality of tutor responses aimed at student’s mistake remediation within educational dialogues. The task consisted of five tracks designed to automatically evaluate the AI tutor’s performance across key dimensions of mistake identification, precise location of the mistake, providing guidance, and feedback actionability, grounded in learning science principles that define good and effective tutor responses, as well as the track focusing on detection of the tutor identity. The task attracted over 50 international teams across all tracks. The submitted models were evaluated against gold-standard human annotations, and the results, while promising, show that there is still significant room for improvement in this domain: the best results for the four pedagogical ability assessment tracks range between macro F1 scores of 58.34 (for providing guidance) and 71.81 (for mistake identification) on three-class problems, with the best F1 score in the tutor identification track reaching 96.98 on a 9-class task. In this paper, we overview the main findings of the shared task, discuss the approaches taken by the teams, and analyze their performance. All resources associated with this task are made publicly available to support futureresearch in this critical domain (https://github.com/kaushal0494/UnifyingAITutorEvaluation/tree/main/BEA_Shared_Task_2025_Datasets).
%R 10.18653/v1/2025.bea-1.77
%U https://aclanthology.org/2025.bea-1.77/
%U https://doi.org/10.18653/v1/2025.bea-1.77
%P 1011-1033
Markdown (Informal)
[Findings of the BEA 2025 Shared Task on Pedagogical Ability Assessment of AI-powered Tutors](https://aclanthology.org/2025.bea-1.77/) (Kochmar et al., BEA 2025)
ACL