@inproceedings{silveira-maua-2026-neuro,
title = "Neuro-symbolic Approaches for Rubric-Based Automatic Essay Evaluation of {ENEM} Essays",
author = "Silveira, Igor Cataneo and
Mau{\'a}, Denis Deratani",
editor = "Souza, Marlo and
de-Dios-Flores, Iria and
Santos, Diana and
Freitas, Larissa and
Souza, Jackson Wilke da Cruz and
Ribeiro, Eug{\'e}nio",
booktitle = "Proceedings of the 17th International Conference on Computational Processing of {P}ortuguese ({PROPOR} 2026) - Vol. 1",
month = apr,
year = "2026",
address = "Salvador, Brazil",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.propor-1.78/",
pages = "790--799",
ISBN = "979-8-89176-387-6",
abstract = "Trait-specific automated scoring of essays written for the standardized Brazilian National Entrance Exam (ENEM) has received significant attention in recent years. The task is both important in a classroom setting, to provide timely and personalized learning feedback, and in the official exam, to make the scoring process more scalable and consistent. The state-of-the-art systems approach the task as a purely statistical predictive task, ignoring the knowledge provided to human graders and test takers in the form of rubrics and guidelines.Aiming to produce more interpretable and informative formative feedback in this work, we leverage the official ENEM Grader{'}s handbook and develop two neuro-symbolic approaches to trait-specific essay scoring.The first approach uses a Large Language Model (GPT4o) to write an evaluative explanation of the essay score according to the subcriteria described in the guidelines; the explanation is then fed into a statistical model to effectively predict the score; the good performance of the scoring validates the quality of the explanations.The second approach formalizes the Guideline grading rubrics as logical rules that derive the essay score as a function of subcriteria, mimicking the recommended human grader{'}s scoring approach.In order to provide weak supervision in training and to evaluate the quality of the model, we build a dataset of 63 essays annotated with their subcriteria by two expert human graders.Our empirical results suggest that both approaches perform on par with purely statistical methods while providing more helpful and fine-grained feedback."
}

<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="silveira-maua-2026-neuro">
<titleInfo>
<title>Neuro-symbolic Approaches for Rubric-Based Automatic Essay Evaluation of ENEM Essays</title>
</titleInfo>
<name type="personal">
<namePart type="given">Igor</namePart>
<namePart type="given">Cataneo</namePart>
<namePart type="family">Silveira</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Denis</namePart>
<namePart type="given">Deratani</namePart>
<namePart type="family">Mauá</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-04</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 17th International Conference on Computational Processing of Portuguese (PROPOR 2026) - Vol. 1</title>
</titleInfo>
<name type="personal">
<namePart type="given">Marlo</namePart>
<namePart type="family">Souza</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Iria</namePart>
<namePart type="family">de-Dios-Flores</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Diana</namePart>
<namePart type="family">Santos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Larissa</namePart>
<namePart type="family">Freitas</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jackson</namePart>
<namePart type="given">Wilke</namePart>
<namePart type="given">da</namePart>
<namePart type="given">Cruz</namePart>
<namePart type="family">Souza</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Eugénio</namePart>
<namePart type="family">Ribeiro</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Salvador, Brazil</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-387-6</identifier>
</relatedItem>
<abstract>Trait-specific automated scoring of essays written for the standardized Brazilian National Entrance Exam (ENEM) has received significant attention in recent years. The task is both important in a classroom setting, to provide timely and personalized learning feedback, and in the official exam, to make the scoring process more scalable and consistent. The state-of-the-art systems approach the task as a purely statistical predictive task, ignoring the knowledge provided to human graders and test takers in the form of rubrics and guidelines. Aiming to produce more interpretable and informative formative feedback in this work, we leverage the official ENEM Grader’s handbook and develop two neuro-symbolic approaches to trait-specific essay scoring. The first approach uses a Large Language Model (GPT4o) to write an evaluative explanation of the essay score according to the subcriteria described in the guidelines; the explanation is then fed into a statistical model to effectively predict the score; the good performance of the scoring validates the quality of the explanations. The second approach formalizes the Guideline grading rubrics as logical rules that derive the essay score as a function of subcriteria, mimicking the recommended human grader’s scoring approach. In order to provide weak supervision in training and to evaluate the quality of the model, we build a dataset of 63 essays annotated with their subcriteria by two expert human graders. Our empirical results suggest that both approaches perform on par with purely statistical methods while providing more helpful and fine-grained feedback.</abstract>
<identifier type="citekey">silveira-maua-2026-neuro</identifier>
<location>
<url>https://aclanthology.org/2026.propor-1.78/</url>
</location>
<part>
<date>2026-04</date>
<extent unit="page">
<start>790</start>
<end>799</end>
</extent>
</part>
</mods>
</modsCollection>

%0 Conference Proceedings
%T Neuro-symbolic Approaches for Rubric-Based Automatic Essay Evaluation of ENEM Essays
%A Silveira, Igor Cataneo
%A Mauá, Denis Deratani
%Y Souza, Marlo
%Y de-Dios-Flores, Iria
%Y Santos, Diana
%Y Freitas, Larissa
%Y Souza, Jackson Wilke da Cruz
%Y Ribeiro, Eugénio
%S Proceedings of the 17th International Conference on Computational Processing of Portuguese (PROPOR 2026) - Vol. 1
%D 2026
%8 April
%I Association for Computational Linguistics
%C Salvador, Brazil
%@ 979-8-89176-387-6
%F silveira-maua-2026-neuro
%X Trait-specific automated scoring of essays written for the standardized Brazilian National Entrance Exam (ENEM) has received significant attention in recent years. The task is both important in a classroom setting, to provide timely and personalized learning feedback, and in the official exam, to make the scoring process more scalable and consistent. The state-of-the-art systems approach the task as a purely statistical predictive task, ignoring the knowledge provided to human graders and test takers in the form of rubrics and guidelines. Aiming to produce more interpretable and informative formative feedback in this work, we leverage the official ENEM Grader’s handbook and develop two neuro-symbolic approaches to trait-specific essay scoring. The first approach uses a Large Language Model (GPT4o) to write an evaluative explanation of the essay score according to the subcriteria described in the guidelines; the explanation is then fed into a statistical model to effectively predict the score; the good performance of the scoring validates the quality of the explanations. The second approach formalizes the Guideline grading rubrics as logical rules that derive the essay score as a function of subcriteria, mimicking the recommended human grader’s scoring approach. In order to provide weak supervision in training and to evaluate the quality of the model, we build a dataset of 63 essays annotated with their subcriteria by two expert human graders. Our empirical results suggest that both approaches perform on par with purely statistical methods while providing more helpful and fine-grained feedback.
%U https://aclanthology.org/2026.propor-1.78/
%P 790-799