@inproceedings{avais-etal-2026-textual,
    title = "Textual Inference in {P}ortuguese: Comparing Language Models",
    author = "Avais, Fabiana and
      Paiva, Valeria de and
      Real, Livy",
    editor = "Souza, Marlo and
      de-Dios-Flores, Iria and
      Santos, Diana and
      Freitas, Larissa and
      Souza, Jackson Wilke da Cruz and
      Ribeiro, Eug{\'e}nio",
    booktitle = "Proceedings of the 17th International Conference on Computational Processing of {P}ortuguese ({PROPOR} 2026) - Vol. 2",
    month = apr,
    year = "2026",
    address = "Salvador, Brazil",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2026.propor-2.28/",
    pages = "201--209",
    isbn = "979-8-89176-387-6",
    abstract = "Large language models (LLMs) are increasingly used for Natural Language Inference (NLI), yet their ability to perform logic-sensitive semantic reasoning, especially outside English, remains underexplored. This paper presents a preliminary investigation into the feasibility and usefulness of developing FraCaS-BR, a Portuguese adaptation of the FraCaS benchmark for semantic inference. Using a small diagnostic subset of seven FraCaS problems focusing on generalized quantifiers, plurals, and nominal anaphora, we evaluate the behavior of three LLMs (ChatGPT, Maritalk, and Evaristo) on Brazilian Portuguese translations. Each problem is submitted multiple times to assess correctness, variance, and consistency relative to the original FraCaS gold labels. The results reveal systematic differences across models. While ChatGPT shows higher overall correctness and stability, all models exhibit limitations that undermine their reliability on logic-controlled inference tasks. The extent of manual correction required during translation further underscores the necessity of human-in-the-loop evaluation. Taken together, these findings support and motivate the development of FraCaS-BR as a controlled evaluation resource for assessing semantic reasoning in Portuguese."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="avais-etal-2026-textual">
<titleInfo>
<title>Textual Inference in Portuguese: Comparing Language Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Fabiana</namePart>
<namePart type="family">Avais</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Valeria</namePart>
<namePart type="given">de</namePart>
<namePart type="family">Paiva</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Livy</namePart>
<namePart type="family">Real</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-04</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 17th International Conference on Computational Processing of Portuguese (PROPOR 2026) - Vol. 2</title>
</titleInfo>
<name type="personal">
<namePart type="given">Marlo</namePart>
<namePart type="family">Souza</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Iria</namePart>
<namePart type="family">de-Dios-Flores</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Diana</namePart>
<namePart type="family">Santos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Larissa</namePart>
<namePart type="family">Freitas</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jackson</namePart>
<namePart type="given">Wilke</namePart>
<namePart type="given">da</namePart>
<namePart type="given">Cruz</namePart>
<namePart type="family">Souza</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Eugénio</namePart>
<namePart type="family">Ribeiro</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Salvador, Brazil</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-387-6</identifier>
</relatedItem>
<abstract>Large language models (LLMs) are increasingly used for Natural Language Inference (NLI), yet their ability to perform logic-sensitive semantic reasoning, especially outside English, remains underexplored. This paper presents a preliminary investigation into the feasibility and usefulness of developing FraCaS-BR, a Portuguese adaptation of the FraCaS benchmark for semantic inference. Using a small diagnostic subset of seven FraCaS problems focusing on generalized quantifiers, plurals, and nominal anaphora, we evaluate the behavior of three LLMs (ChatGPT, Maritalk, and Evaristo) on Brazilian Portuguese translations. Each problem is submitted multiple times to assess correctness, variance, and consistency relative to the original FraCaS gold labels. The results reveal systematic differences across models. While ChatGPT shows higher overall correctness and stability, all models exhibit limitations that undermine their reliability on logic-controlled inference tasks. The extent of manual correction required during translation further underscores the necessity of human-in-the-loop evaluation. Taken together, these findings support and motivate the development of FraCaS-BR as a controlled evaluation resource for assessing semantic reasoning in Portuguese.</abstract>
<identifier type="citekey">avais-etal-2026-textual</identifier>
<location>
<url>https://aclanthology.org/2026.propor-2.28/</url>
</location>
<part>
<date>2026-04</date>
<extent unit="page">
<start>201</start>
<end>209</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Textual Inference in Portuguese: Comparing Language Models
%A Avais, Fabiana
%A Paiva, Valeria de
%A Real, Livy
%Y Souza, Marlo
%Y de-Dios-Flores, Iria
%Y Santos, Diana
%Y Freitas, Larissa
%Y Souza, Jackson Wilke da Cruz
%Y Ribeiro, Eugénio
%S Proceedings of the 17th International Conference on Computational Processing of Portuguese (PROPOR 2026) - Vol. 2
%D 2026
%8 April
%I Association for Computational Linguistics
%C Salvador, Brazil
%@ 979-8-89176-387-6
%F avais-etal-2026-textual
%X Large language models (LLMs) are increasingly used for Natural Language Inference (NLI), yet their ability to perform logic-sensitive semantic reasoning, especially outside English, remains underexplored. This paper presents a preliminary investigation into the feasibility and usefulness of developing FraCaS-BR, a Portuguese adaptation of the FraCaS benchmark for semantic inference. Using a small diagnostic subset of seven FraCaS problems focusing on generalized quantifiers, plurals, and nominal anaphora, we evaluate the behavior of three LLMs (ChatGPT, Maritalk, and Evaristo) on Brazilian Portuguese translations. Each problem is submitted multiple times to assess correctness, variance, and consistency relative to the original FraCaS gold labels. The results reveal systematic differences across models. While ChatGPT shows higher overall correctness and stability, all models exhibit limitations that undermine their reliability on logic-controlled inference tasks. The extent of manual correction required during translation further underscores the necessity of human-in-the-loop evaluation. Taken together, these findings support and motivate the development of FraCaS-BR as a controlled evaluation resource for assessing semantic reasoning in Portuguese.
%U https://aclanthology.org/2026.propor-2.28/
%P 201-209
Markdown (Informal)
[Textual Inference in Portuguese: Comparing Language Models](https://aclanthology.org/2026.propor-2.28/) (Avais et al., PROPOR 2026)
ACL
- Fabiana Avais, Valeria de Paiva, and Livy Real. 2026. Textual Inference in Portuguese: Comparing Language Models. In Proceedings of the 17th International Conference on Computational Processing of Portuguese (PROPOR 2026) - Vol. 2, pages 201–209, Salvador, Brazil. Association for Computational Linguistics.