@inproceedings{masid-etal-2026-auditing,
title = "Auditing the Evaluators: How Far Can Automatic Evaluation Go in Assessing {P}ortuguese Financial Texts?",
author = "Masid, Marina Ramalhete and
Assis, Gabriel and
Vianna, Daniela and
Paes, Aline and
Silva, Altigran Soares da",
editor = "Souza, Marlo and
de-Dios-Flores, Iria and
Santos, Diana and
Freitas, Larissa and
Souza, Jackson Wilke da Cruz and
Ribeiro, Eug{\'e}nio",
booktitle = "Proceedings of the 17th International Conference on Computational Processing of {P}ortuguese ({PROPOR} 2026) - Vol. 1",
month = apr,
year = "2026",
address = "Salvador, Brazil",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.propor-1.22/",
pages = "222--233",
ISBN = "979-8-89176-387-6",
abstract = "Automatic metrics are widely used to evaluate text quality across various natural language processing tasks. Despite their convenience and scalability, the extent to which these metrics reliably reflect textual quality remains an open challenge. The LLM-as-a-judge paradigm has recently emerged, aligning more closely with human judgments by using LLMs themselves as evaluators. However, there is still a gap in such evaluations across specific domains and languages, as most prior work focuses on generic task benchmarks in English. In this paper, we examine the robustness of both traditional automatic metrics and the LLM-as-a-judge approach for assessing the quality of financial commentaries in Portuguese, an underexplored task and language that has been neglected in previous work. We introduce fine-grained perturbations into the texts generated by specialists to analyze which types of noise most significantly affect evaluation outcomes, using noise-free counterparts as references. The results highlight the weaknesses of classical metrics in this specific task and the limitations of even recent evaluation paradigms, underscoring the need to develop context- and domain-sensitive."
}

<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="masid-etal-2026-auditing">
<titleInfo>
<title>Auditing the Evaluators: How Far Can Automatic Evaluation Go in Assessing Portuguese Financial Texts?</title>
</titleInfo>
<name type="personal">
<namePart type="given">Marina</namePart>
<namePart type="given">Ramalhete</namePart>
<namePart type="family">Masid</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gabriel</namePart>
<namePart type="family">Assis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Daniela</namePart>
<namePart type="family">Vianna</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aline</namePart>
<namePart type="family">Paes</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Altigran</namePart>
<namePart type="given">Soares</namePart>
<namePart type="given">da</namePart>
<namePart type="family">Silva</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-04</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 17th International Conference on Computational Processing of Portuguese (PROPOR 2026) - Vol. 1</title>
</titleInfo>
<name type="personal">
<namePart type="given">Marlo</namePart>
<namePart type="family">Souza</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Iria</namePart>
<namePart type="family">de-Dios-Flores</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Diana</namePart>
<namePart type="family">Santos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Larissa</namePart>
<namePart type="family">Freitas</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jackson</namePart>
<namePart type="given">Wilke</namePart>
<namePart type="given">da</namePart>
<namePart type="given">Cruz</namePart>
<namePart type="family">Souza</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Eugénio</namePart>
<namePart type="family">Ribeiro</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Salvador, Brazil</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-387-6</identifier>
</relatedItem>
<abstract>Automatic metrics are widely used to evaluate text quality across various natural language processing tasks. Despite their convenience and scalability, the extent to which these metrics reliably reflect textual quality remains an open challenge. The LLM-as-a-judge paradigm has recently emerged, aligning more closely with human judgments by using LLMs themselves as evaluators. However, there is still a gap in such evaluations across specific domains and languages, as most prior work focuses on generic task benchmarks in English. In this paper, we examine the robustness of both traditional automatic metrics and the LLM-as-a-judge approach for assessing the quality of financial commentaries in Portuguese, an underexplored task and language that have been neglected in previous work. We introduce fine-grained perturbations into the texts generated by specialists to analyze which types of noise most significantly affect evaluation outcomes, using noise-free counterparts as references. The results highlight the weaknesses of classical metrics in this specific task and the limitations of even recent evaluation paradigms, underscoring the need to develop context- and domain-sensitive evaluation approaches.</abstract>
<identifier type="citekey">masid-etal-2026-auditing</identifier>
<location>
<url>https://aclanthology.org/2026.propor-1.22/</url>
</location>
<part>
<date>2026-04</date>
<extent unit="page">
<start>222</start>
<end>233</end>
</extent>
</part>
</mods>
</modsCollection>

%0 Conference Proceedings
%T Auditing the Evaluators: How Far Can Automatic Evaluation Go in Assessing Portuguese Financial Texts?
%A Masid, Marina Ramalhete
%A Assis, Gabriel
%A Vianna, Daniela
%A Paes, Aline
%A Silva, Altigran Soares da
%Y Souza, Marlo
%Y de-Dios-Flores, Iria
%Y Santos, Diana
%Y Freitas, Larissa
%Y Souza, Jackson Wilke da Cruz
%Y Ribeiro, Eugénio
%S Proceedings of the 17th International Conference on Computational Processing of Portuguese (PROPOR 2026) - Vol. 1
%D 2026
%8 April
%I Association for Computational Linguistics
%C Salvador, Brazil
%@ 979-8-89176-387-6
%F masid-etal-2026-auditing
%X Automatic metrics are widely used to evaluate text quality across various natural language processing tasks. Despite their convenience and scalability, the extent to which these metrics reliably reflect textual quality remains an open challenge. The LLM-as-a-judge paradigm has recently emerged, aligning more closely with human judgments by using LLMs themselves as evaluators. However, there is still a gap in such evaluations across specific domains and languages, as most prior work focuses on generic task benchmarks in English. In this paper, we examine the robustness of both traditional automatic metrics and the LLM-as-a-judge approach for assessing the quality of financial commentaries in Portuguese, an underexplored task and language that have been neglected in previous work. We introduce fine-grained perturbations into the texts generated by specialists to analyze which types of noise most significantly affect evaluation outcomes, using noise-free counterparts as references. The results highlight the weaknesses of classical metrics in this specific task and the limitations of even recent evaluation paradigms, underscoring the need to develop context- and domain-sensitive evaluation approaches.
%U https://aclanthology.org/2026.propor-1.22/
%P 222-233

Markdown (Informal)
[Auditing the Evaluators: How Far Can Automatic Evaluation Go in Assessing Portuguese Financial Texts?](https://aclanthology.org/2026.propor-1.22/) (Masid et al., PROPOR 2026)

ACL
Marina Ramalhete Masid, Gabriel Assis, Daniela Vianna, Aline Paes, and Altigran Soares da Silva. 2026. Auditing the Evaluators: How Far Can Automatic Evaluation Go in Assessing Portuguese Financial Texts?. In Proceedings of the 17th International Conference on Computational Processing of Portuguese (PROPOR 2026) - Vol. 1, pages 222–233, Salvador, Brazil. Association for Computational Linguistics.