% Conference paper entry (SemEval-2026 Task 11 system description).
% Case-sensitive words in title/booktitle are protected with whole-word braces
% so that sentence-casing styles keep their capitalisation.
% NOTE(review): `address` appears to hold the conference location rather than
% the publisher's city -- confirm this is what the target style expects.
@inproceedings{bayan-memar-etal-2026-ellat,
  title     = {Ellat at {SemEval}-2026 Task 11: Comparing Encoder and Decoder Models for Syllogistic Reasoning},
  author    = {Bayan Memar, Farzaneh and
               Huls, Hanneke and
               Ten Hove, Matthijs},
  editor    = {Kochmar, Ekaterina and
               Ghosh, Debanjan and
               North, Kai and
               Komachi, Mamoru},
  booktitle = {Proceedings of the 20th {International} {Workshop} on {Semantic} {Evaluation} (2026)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, USA},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.semeval-1.270/},
  pages     = {2130--2138},
  isbn      = {979-8-89176-414-9},
  abstract  = {For SemEval-2026 Task 11 (Subtask 1: English), Team Ellat investigates whether language models can assess logical validity independently of semantic plausibility. Since these models learn statistical patterns instead of explicit logical rules, they often rely on world knowledge and semantic shortcuts rather than formal logic. To address this challenge, we evaluate three architectures: MiniLM-L6-mnli-binary, DeBERTa-v3-small, and Llama 3.1-8B-Instruct, applying task-specific fine-tuning for encoder models and Abstract Logic Augmentation with QLoRA for LLaMA. DeBERTa achieved the strongest overall performance, MiniLM showed clear reductions in content bias after fine-tuning, and Llama 3.1-8B exhibited strong plausibility bias in the zero-shot setting. However, our augmented fine-tuning approach led to only modest improvements and a partial shift toward structure-based reasoning. Overall, fine-tuning and abstraction-based augmentation reduce plausibility bias, but fully separating logical validity from semantic content remains challenging across architectures.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- MODS record for a single paper; the ID matches the citekey identifier below. -->
  <mods ID="bayan-memar-etal-2026-ellat">
    <titleInfo>
      <title>Ellat at SemEval-2026 Task 11: Comparing Encoder and Decoder Models for Syllogistic Reasoning</title>
    </titleInfo>
    <!-- Paper authors, in listed order. -->
    <name type="personal">
      <namePart type="given">Farzaneh</namePart>
      <namePart type="family">Bayan Memar</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Hanneke</namePart>
      <namePart type="family">Huls</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Matthijs</namePart>
      <namePart type="family">Ten Hove</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2026-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, with its editors, publisher, place and ISBN. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 20th International Workshop on Semantic Evaluation (2026)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Ekaterina</namePart>
        <namePart type="family">Kochmar</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Debanjan</namePart>
        <namePart type="family">Ghosh</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Kai</namePart>
        <namePart type="family">North</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Mamoru</namePart>
        <namePart type="family">Komachi</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">San Diego, California, USA</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-414-9</identifier>
    </relatedItem>
    <abstract>For SemEval-2026 Task 11 (Subtask 1: English), Team Ellat investigates whether language models can assess logical validity independently of semantic plausibility. Since these models learn statistical patterns instead of explicit logical rules, they often rely on world knowledge and semantic shortcuts rather than formal logic. To address this challenge, we evaluate three architectures: MiniLM-L6-mnli-binary, DeBERTa-v3-small, and Llama 3.1-8B-Instruct, applying task-specific fine-tuning for encoder models and Abstract Logic Augmentation with QLoRA for LLaMA. DeBERTa achieved the strongest overall performance, MiniLM showed clear reductions in content bias after fine-tuning, and Llama 3.1-8B exhibited strong plausibility bias in the zero-shot setting. However, our augmented fine-tuning approach led to only modest improvements and a partial shift toward structure-based reasoning. Overall, fine-tuning and abstraction-based augmentation reduce plausibility bias, but fully separating logical validity from semantic content remains challenging across architectures.</abstract>
    <identifier type="citekey">bayan-memar-etal-2026-ellat</identifier>
    <location>
      <url>https://aclanthology.org/2026.semeval-1.270/</url>
    </location>
    <!-- Position of the paper within the host volume: issue date and page range. -->
    <part>
      <date>2026-07</date>
      <extent unit="page">
        <start>2130</start>
        <end>2138</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Ellat at SemEval-2026 Task 11: Comparing Encoder and Decoder Models for Syllogistic Reasoning
%A Bayan Memar, Farzaneh
%A Huls, Hanneke
%A Ten Hove, Matthijs
%Y Kochmar, Ekaterina
%Y Ghosh, Debanjan
%Y North, Kai
%Y Komachi, Mamoru
%S Proceedings of the 20th International Workshop on Semantic Evaluation (2026)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, USA
%@ 979-8-89176-414-9
%F bayan-memar-etal-2026-ellat
%X For SemEval-2026 Task 11 (Subtask 1: English), Team Ellat investigates whether language models can assess logical validity independently of semantic plausibility. Since these models learn statistical patterns instead of explicit logical rules, they often rely on world knowledge and semantic shortcuts rather than formal logic. To address this challenge, we evaluate three architectures: MiniLM-L6-mnli-binary, DeBERTa-v3-small, and Llama 3.1-8B-Instruct, applying task-specific fine-tuning for encoder models and Abstract Logic Augmentation with QLoRA for LLaMA. DeBERTa achieved the strongest overall performance, MiniLM showed clear reductions in content bias after fine-tuning, and Llama 3.1-8B exhibited strong plausibility bias in the zero-shot setting. However, our augmented fine-tuning approach led to only modest improvements and a partial shift toward structure-based reasoning. Overall, fine-tuning and abstraction-based augmentation reduce plausibility bias, but fully separating logical validity from semantic content remains challenging across architectures.
%U https://aclanthology.org/2026.semeval-1.270/
%P 2130-2138
Markdown (Informal)
[Ellat at SemEval-2026 Task 11: Comparing Encoder and Decoder Models for Syllogistic Reasoning](https://aclanthology.org/2026.semeval-1.270/) (Bayan Memar et al., SemEval 2026)
ACL