% BibTeX record for the GigitAI SemEval-2026 Task 11 system paper (ACL Anthology ID 2026.semeval-1.425).
% Case-sensitive words are protected with whole-word braces so sentence-casing styles keep them intact.
@inproceedings{krishnasamy-2026-gigitai,
    title     = "{GigitAI} at {SemEval}-2026 Task 11: Hybrid Symbolic-Neural Approach for Syllogistic Validity Classification",
    author    = "Krishnasamy, Saran",
    editor    = "Kochmar, Ekaterina and
                 Ghosh, Debanjan and
                 North, Kai and
                 Komachi, Mamoru",
    booktitle = "Proceedings of the 20th {International} {Workshop} on {Semantic} {Evaluation} (2026)",
    month     = jul,
    year      = "2026",
    address   = "San Diego, California, USA",
    publisher = "Association for Computational Linguistics",
    url       = "https://aclanthology.org/2026.semeval-1.425/",
    pages     = "3432--3441",
    isbn      = "979-8-89176-414-9",
    abstract  = "We present our system for SemEval-2026 Task 11 on classifying whether syllogisms are logically valid. The main challenge is that language models tend to judge arguments based on whether the conclusion sounds true in the real world, rather than whether it follows logically from the premises. We evaluate direct prompting across six models (GPT-4o, GPT-5.2, o3, o3-mini, Claude Opus 4.6, Claude Sonnet 4) with three prompt strategies, finding that even the best achieves only 89.5{\%} accuracy. Our best-performing system splits the task into two parts: GPT-4o-mini extracts the logical structure, then deterministic rules check validity, enhanced with bidirectional premise checking, predicate negation post-processing, and a targeted rule-based fallback for double negation. This achieves 98.95{\%} accuracy on Subtask 1 (combined score 57.74) and 85.8{\%} validity accuracy on Subtask 2. We also explore self-consistency with symbolic verification (93.1{\%}), content abstraction, activation steering, contrastive fine-tuning, RLVR, and diffusion-based reasoning, finding that content abstraction surprisingly degrades performance, revealing that semantic content provides essential parsing scaffolding alongside the bias it introduces."
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 record for the paper with citekey krishnasamy-2026-gigitai. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="krishnasamy-2026-gigitai">
    <!-- The paper itself: title, author, issue date. -->
    <titleInfo>
      <title>GigitAI at SemEval-2026 Task 11: Hybrid Symbolic-Neural Approach for Syllogistic Validity Classification</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Saran</namePart>
      <namePart type="family">Krishnasamy</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2026-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, its editors, publisher, place and ISBN. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 20th International Workshop on Semantic Evaluation (2026)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Ekaterina</namePart>
        <namePart type="family">Kochmar</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Debanjan</namePart>
        <namePart type="family">Ghosh</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Kai</namePart>
        <namePart type="family">North</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Mamoru</namePart>
        <namePart type="family">Komachi</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">San Diego, California, USA</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-414-9</identifier>
    </relatedItem>
    <abstract>We present our system for SemEval-2026 Task 11 on classifying whether syllogisms are logically valid. The main challenge is that language models tend to judge arguments based on whether the conclusion sounds true in the real world, rather than whether it follows logically from the premises. We evaluate direct prompting across six models (GPT-4o, GPT-5.2, o3, o3-mini, Claude Opus 4.6, Claude Sonnet 4) with three prompt strategies, finding that even the best achieves only 89.5% accuracy. Our best-performing system splits the task into two parts: GPT-4o-mini extracts the logical structure, then deterministic rules check validity, enhanced with bidirectional premise checking, predicate negation post-processing, and a targeted rule-based fallback for double negation. This achieves 98.95% accuracy on Subtask 1 (combined score 57.74) and 85.8% validity accuracy on Subtask 2. We also explore self-consistency with symbolic verification (93.1%), content abstraction, activation steering, contrastive fine-tuning, RLVR, and diffusion-based reasoning, finding that content abstraction surprisingly degrades performance, revealing that semantic content provides essential parsing scaffolding alongside the bias it introduces.</abstract>
    <identifier type="citekey">krishnasamy-2026-gigitai</identifier>
    <location>
      <url>https://aclanthology.org/2026.semeval-1.425/</url>
    </location>
    <!-- Position of the paper inside the host volume (page range). -->
    <part>
      <date>2026-07</date>
      <extent unit="page">
        <start>3432</start>
        <end>3441</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%A Krishnasamy, Saran
%T GigitAI at SemEval-2026 Task 11: Hybrid Symbolic-Neural Approach for Syllogistic Validity Classification
%Y Kochmar, Ekaterina
%Y Ghosh, Debanjan
%Y North, Kai
%Y Komachi, Mamoru
%S Proceedings of the 20th International Workshop on Semantic Evaluation (2026)
%D 2026
%8 July
%C San Diego, California, USA
%I Association for Computational Linguistics
%P 3432-3441
%@ 979-8-89176-414-9
%U https://aclanthology.org/2026.semeval-1.425/
%F krishnasamy-2026-gigitai
%X We present our system for SemEval-2026 Task 11 on classifying whether syllogisms are logically valid. The main challenge is that language models tend to judge arguments based on whether the conclusion sounds true in the real world, rather than whether it follows logically from the premises. We evaluate direct prompting across six models (GPT-4o, GPT-5.2, o3, o3-mini, Claude Opus 4.6, Claude Sonnet 4) with three prompt strategies, finding that even the best achieves only 89.5% accuracy. Our best-performing system splits the task into two parts: GPT-4o-mini extracts the logical structure, then deterministic rules check validity, enhanced with bidirectional premise checking, predicate negation post-processing, and a targeted rule-based fallback for double negation. This achieves 98.95% accuracy on Subtask 1 (combined score 57.74) and 85.8% validity accuracy on Subtask 2. We also explore self-consistency with symbolic verification (93.1%), content abstraction, activation steering, contrastive fine-tuning, RLVR, and diffusion-based reasoning, finding that content abstraction surprisingly degrades performance, revealing that semantic content provides essential parsing scaffolding alongside the bias it introduces.
Markdown (Informal)
[GigitAI at SemEval-2026 Task 11: Hybrid Symbolic-Neural Approach for Syllogistic Validity Classification](https://aclanthology.org/2026.semeval-1.425/) (Krishnasamy, SemEval 2026)
ACL