% BibTeX record for the ACL Anthology paper 2026.propor-1.89 (PROPOR 2026).
@inproceedings{ribeiro-etal-2026-evaluating,
  title     = {Evaluating Reference-Free Summarization Quality Metrics for {Portuguese}: A Study with Human Judgments in Financial News},
  author    = {Ribeiro, Jo{\~a}o Victor Assaoka and
               Correia, Thomas Pires and
               Requena, Jos{\'e} Vitor Souza Cardoso and
               Berton, Lilian},
  editor    = {Souza, Marlo and
               de-Dios-Flores, Iria and
               Santos, Diana and
               Freitas, Larissa and
               Souza, Jackson Wilke da Cruz and
               Ribeiro, Eug{\'e}nio},
  booktitle = {Proceedings of the 17th International Conference on Computational Processing of {Portuguese} ({PROPOR} 2026) - Vol. 1},
  month     = apr,
  year      = {2026},
  address   = {Salvador, Brazil},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.propor-1.89/},
  pages     = {899--907},
  isbn      = {979-8-89176-387-6},
  abstract  = {Automatic summarization of financial news in Portuguese lacks reliable reference-free evaluation metrics. While LLM-as-a-Judge approaches are gaining traction, their correlation with human perception in specialized domains remains under-explored. This work evaluates the efficacy of Question Answering (QA) based metrics against a direct LLM-as-a-Judge baseline for Portuguese financial news. We propose a pipeline comparing Lexical, Binary, and Semantic (LLM-based) QA scoring methods, validated against a human ground truth of 50 news items annotated for Faithfulness and Completeness. Our results show that granular QA metrics significantly outperform the monolithic LLM-Judge in evaluating Completeness, with QA-Binary achieving the highest rank correlation ({\ensuremath{\rho}} {\ensuremath{\approx}} 0.49 with pessimistic human aggregation). For Faithfulness, we observe a strong ceiling effect in human evaluation, yet the Semantic QA metric demonstrated a ``super-human'' ability to detect subtle hallucinations (e.g., temporal shifts) missed by annotators. We conclude that decomposing evaluation into atomic QA pairs is superior to holistic judging for the Portuguese financial domain.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="ribeiro-etal-2026-evaluating">
<titleInfo>
<title>Evaluating Reference-Free Summarization Quality Metrics for Portuguese: A Study with Human Judgments in Financial News</title>
</titleInfo>
<name type="personal">
<namePart type="given">João</namePart>
<namePart type="given">Victor</namePart>
<namePart type="given">Assaoka</namePart>
<namePart type="family">Ribeiro</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Thomas</namePart>
<namePart type="given">Pires</namePart>
<namePart type="family">Correia</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">José</namePart>
<namePart type="given">Vitor</namePart>
<namePart type="given">Souza</namePart>
<namePart type="given">Cardoso</namePart>
<namePart type="family">Requena</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lilian</namePart>
<namePart type="family">Berton</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-04</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 17th International Conference on Computational Processing of Portuguese (PROPOR 2026) - Vol. 1</title>
</titleInfo>
<name type="personal">
<namePart type="given">Marlo</namePart>
<namePart type="family">Souza</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Iria</namePart>
<namePart type="family">de-Dios-Flores</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Diana</namePart>
<namePart type="family">Santos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Larissa</namePart>
<namePart type="family">Freitas</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jackson</namePart>
<namePart type="given">Wilke</namePart>
<namePart type="given">da</namePart>
<namePart type="given">Cruz</namePart>
<namePart type="family">Souza</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Eugénio</namePart>
<namePart type="family">Ribeiro</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Salvador, Brazil</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-387-6</identifier>
</relatedItem>
<abstract>Automatic summarization of financial news in Portuguese lacks reliable reference-free evaluation metrics. While LLM-as-a-Judge approaches are gaining traction, their correlation with human perception in specialized domains remains under-explored. This work evaluates the efficacy of Question Answering (QA) based metrics against a direct LLM-as-a-Judge baseline for Portuguese financial news. We propose a pipeline comparing Lexical, Binary, and Semantic (LLM-based) QA scoring methods, validated against a human ground truth of 50 news items annotated for Faithfulness and Completeness. Our results show that granular QA metrics significantly outperform the monolithic LLM-Judge in evaluating Completeness, with QA-Binary achieving the highest rank correlation (ρ ≈ 0.49 with pessimistic human aggregation). For Faithfulness, we observe a strong ceiling effect in human evaluation, yet the Semantic QA metric demonstrated a “super-human” ability to detect subtle hallucinations (e.g., temporal shifts) missed by annotators. We conclude that decomposing evaluation into atomic QA pairs is superior to holistic judging for the Portuguese financial domain.</abstract>
<identifier type="citekey">ribeiro-etal-2026-evaluating</identifier>
<location>
<url>https://aclanthology.org/2026.propor-1.89/</url>
</location>
<part>
<date>2026-04</date>
<extent unit="page">
<start>899</start>
<end>907</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Evaluating Reference-Free Summarization Quality Metrics for Portuguese: A Study with Human Judgments in Financial News
%A Ribeiro, João Victor Assaoka
%A Correia, Thomas Pires
%A Requena, José Vitor Souza Cardoso
%A Berton, Lilian
%Y Souza, Marlo
%Y de-Dios-Flores, Iria
%Y Santos, Diana
%Y Freitas, Larissa
%Y Souza, Jackson Wilke da Cruz
%Y Ribeiro, Eugénio
%S Proceedings of the 17th International Conference on Computational Processing of Portuguese (PROPOR 2026) - Vol. 1
%D 2026
%8 April
%I Association for Computational Linguistics
%C Salvador, Brazil
%@ 979-8-89176-387-6
%F ribeiro-etal-2026-evaluating
%X Automatic summarization of financial news in Portuguese lacks reliable reference-free evaluation metrics. While LLM-as-a-Judge approaches are gaining traction, their correlation with human perception in specialized domains remains under-explored. This work evaluates the efficacy of Question Answering (QA) based metrics against a direct LLM-as-a-Judge baseline for Portuguese financial news. We propose a pipeline comparing Lexical, Binary, and Semantic (LLM-based) QA scoring methods, validated against a human ground truth of 50 news items annotated for Faithfulness and Completeness. Our results show that granular QA metrics significantly outperform the monolithic LLM-Judge in evaluating Completeness, with QA-Binary achieving the highest rank correlation (ρ ≈ 0.49 with pessimistic human aggregation). For Faithfulness, we observe a strong ceiling effect in human evaluation, yet the Semantic QA metric demonstrated a “super-human” ability to detect subtle hallucinations (e.g., temporal shifts) missed by annotators. We conclude that decomposing evaluation into atomic QA pairs is superior to holistic judging for the Portuguese financial domain.
%U https://aclanthology.org/2026.propor-1.89/
%P 899-907
Markdown (Informal)
[Evaluating Reference-Free Summarization Quality Metrics for Portuguese: A Study with Human Judgments in Financial News](https://aclanthology.org/2026.propor-1.89/) (Ribeiro et al., PROPOR 2026)
ACL