@inproceedings{ayman-etal-2026-proofbusters,
title = "Proofbusters at {S}em{E}val-2026 Task 11: Neuro-Symbolic Syllogistic Reasoning via {LLM}-Guided Structure Extraction and Deterministic Validation",
author = "Ayman, Mohamed and
Marzouk, Khaled and
Mashaly, Abdallah and
Heriez, Ahmed",
editor = "Kochmar, Ekaterina and
Ghosh, Debanjan and
North, Kai and
Komachi, Mamoru",
booktitle = "Proceedings of the 20th {I}nternational {W}orkshop on {S}emantic {E}valuation (2026)",
month = jul,
year = "2026",
address = "San Diego, California, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.semeval-1.182/",
pages = "1407--1415",
ISBN = "979-8-89176-414-9",
abstract = {This paper presents the **Proofbusters** system for SemEval-2026 Task 11 (English syllogism validity classification). The task evaluates whether language models can perform *formal* syllogistic reasoning independent of semantic content{---}i.e., without being swayed by *belief bias* (judging arguments by plausibility or world knowledge instead of logical validity).The main idea is **symbolic abstraction**: before predicting validity, each syllogism is converted into a content-invariant logical form so the model reasons over structure rather than over concrete terms. Inspired by Euler{'}s abstraction in the K{\"o}nigsberg bridges problem (stripping away geography to reveal pure structure), the paper explores three abstraction strategies of increasing formal rigor:1. **Template abstraction** {---} Replace categorical terms with generic placeholders (e.g., x, y, z); keep syntax and quantifiers. Serves as a baseline (82.20{\%} accuracy).2. **Symbolic OOP abstraction** {---} Map entities and relations into an object-oriented constraint graph with explicit tracking of supersets, disjoint sets, etc. (88.84{\%} with Qwen-7B).3. **Set-theoretic abstraction** {---} Translate premises and conclusion into formal set notation (e.g., {\textbackslash}(A {\textbackslash}subseteq B{\textbackslash}), {\textbackslash}(A {\textbackslash}cap B = {\textbackslash}emptyset{\textbackslash})) and enforce *existential import* ({\textbackslash}(A, B, C {\textbackslash}neq {\textbackslash}emptyset{\textbackslash})) to align with Aristotelian logic. The solver never sees the original natural-language terms.The system uses a **two-stage pipeline**: a **Formulation** stage (natural language {\textrightarrow} symbolic representation) and a **Solver** stage (validity judgment from symbols only). The set-theoretic variant, using Gemini Flash 2.5 for formulation and Gemini Pro 2.5 for solving, achieves **98.95{\%} accuracy** with **2.13** total content effect (TCE) and an **overall score of 46.23**, substantially outperforming both task baselines and the other abstraction variants.The **conclusion** is that belief bias in LLMs is tied to semantic surface form: *explicit abstraction into mathematical set notation* sharply reduces plausibility-driven errors. Robust logical reasoning likely requires **architectural separation** between semantic parsing and formal inference, rather than prompt engineering alone. Remaining challenges include formulation errors (e.g., quantifier misclassification), multi-step constraint composition, and negation{--}inclusion interactions. Future work may combine the abstraction pipeline with formally verified theorem provers and extend it to multilingual or more complex multi-premise reasoning.}
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="ayman-etal-2026-proofbusters">
<titleInfo>
<title>Proofbusters at SemEval-2026 Task 11: Neuro-Symbolic Syllogistic Reasoning via LLM-Guided Structure Extraction and Deterministic Validation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Mohamed</namePart>
<namePart type="family">Ayman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Khaled</namePart>
<namePart type="family">Marzouk</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Abdallah</namePart>
<namePart type="family">Mashaly</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ahmed</namePart>
<namePart type="family">Heriez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 20th International Workshop on Semantic Evaluation (2026)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Kochmar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Debanjan</namePart>
<namePart type="family">Ghosh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kai</namePart>
<namePart type="family">North</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mamoru</namePart>
<namePart type="family">Komachi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-414-9</identifier>
</relatedItem>
<abstract>This paper presents the **Proofbusters** system for SemEval-2026 Task 11 (English syllogism validity classification). The task evaluates whether language models can perform *formal* syllogistic reasoning independent of semantic content—i.e., without being swayed by *belief bias* (judging arguments by plausibility or world knowledge instead of logical validity).The main idea is **symbolic abstraction**: before predicting validity, each syllogism is converted into a content-invariant logical form so the model reasons over structure rather than over concrete terms. Inspired by Euler’s abstraction in the Königsberg bridges problem (stripping away geography to reveal pure structure), the paper explores three abstraction strategies of increasing formal rigor:1. **Template abstraction** — Replace categorical terms with generic placeholders (e.g., x, y, z); keep syntax and quantifiers. Serves as a baseline (82.20% accuracy).2. **Symbolic OOP abstraction** — Map entities and relations into an object-oriented constraint graph with explicit tracking of supersets, disjoint sets, etc. (88.84% with Qwen-7B).3. **Set-theoretic abstraction** — Translate premises and conclusion into formal set notation (e.g., \textbackslash(A \textbackslashsubseteq B\textbackslash), \textbackslash(A \textbackslashcap B = \textbackslashemptyset\textbackslash)) and enforce *existential import* (\textbackslash(A, B, C \textbackslashneq \textbackslashemptyset\textbackslash)) to align with Aristotelian logic. The solver never sees the original natural-language terms.The system uses a **two-stage pipeline**: a **Formulation** stage (natural language → symbolic representation) and a **Solver** stage (validity judgment from symbols only). The set-theoretic variant, using Gemini Flash 2.5 for formulation and Gemini Pro 2.5 for solving, achieves **98.95% accuracy** with **2.13** total content effect (TCE) and an **overall score of 46.23**, substantially outperforming both task baselines and the other abstraction variants.The **conclusion** is that belief bias in LLMs is tied to semantic surface form: *explicit abstraction into mathematical set notation* sharply reduces plausibility-driven errors. Robust logical reasoning likely requires **architectural separation** between semantic parsing and formal inference, rather than prompt engineering alone. Remaining challenges include formulation errors (e.g., quantifier misclassification), multi-step constraint composition, and negation–inclusion interactions. Future work may combine the abstraction pipeline with formally verified theorem provers and extend it to multilingual or more complex multi-premise reasoning.</abstract>
<identifier type="citekey">ayman-etal-2026-proofbusters</identifier>
<location>
<url>https://aclanthology.org/2026.semeval-1.182/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>1407</start>
<end>1415</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Proofbusters at SemEval-2026 Task 11: Neuro-Symbolic Syllogistic Reasoning via LLM-Guided Structure Extraction and Deterministic Validation
%A Ayman, Mohamed
%A Marzouk, Khaled
%A Mashaly, Abdallah
%A Heriez, Ahmed
%Y Kochmar, Ekaterina
%Y Ghosh, Debanjan
%Y North, Kai
%Y Komachi, Mamoru
%S Proceedings of the 20th International Workshop on Semantic Evaluation (2026)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, USA
%@ 979-8-89176-414-9
%F ayman-etal-2026-proofbusters
%X This paper presents the **Proofbusters** system for SemEval-2026 Task 11 (English syllogism validity classification). The task evaluates whether language models can perform *formal* syllogistic reasoning independent of semantic content—i.e., without being swayed by *belief bias* (judging arguments by plausibility or world knowledge instead of logical validity).The main idea is **symbolic abstraction**: before predicting validity, each syllogism is converted into a content-invariant logical form so the model reasons over structure rather than over concrete terms. Inspired by Euler’s abstraction in the Königsberg bridges problem (stripping away geography to reveal pure structure), the paper explores three abstraction strategies of increasing formal rigor:1. **Template abstraction** — Replace categorical terms with generic placeholders (e.g., x, y, z); keep syntax and quantifiers. Serves as a baseline (82.20% accuracy).2. **Symbolic OOP abstraction** — Map entities and relations into an object-oriented constraint graph with explicit tracking of supersets, disjoint sets, etc. (88.84% with Qwen-7B).3. **Set-theoretic abstraction** — Translate premises and conclusion into formal set notation (e.g., \textbackslash(A \textbackslashsubseteq B\textbackslash), \textbackslash(A \textbackslashcap B = \textbackslashemptyset\textbackslash)) and enforce *existential import* (\textbackslash(A, B, C \textbackslashneq \textbackslashemptyset\textbackslash)) to align with Aristotelian logic. The solver never sees the original natural-language terms.The system uses a **two-stage pipeline**: a **Formulation** stage (natural language → symbolic representation) and a **Solver** stage (validity judgment from symbols only). The set-theoretic variant, using Gemini Flash 2.5 for formulation and Gemini Pro 2.5 for solving, achieves **98.95% accuracy** with **2.13** total content effect (TCE) and an **overall score of 46.23**, substantially outperforming both task baselines and the other abstraction variants.The **conclusion** is that belief bias in LLMs is tied to semantic surface form: *explicit abstraction into mathematical set notation* sharply reduces plausibility-driven errors. Robust logical reasoning likely requires **architectural separation** between semantic parsing and formal inference, rather than prompt engineering alone. Remaining challenges include formulation errors (e.g., quantifier misclassification), multi-step constraint composition, and negation–inclusion interactions. Future work may combine the abstraction pipeline with formally verified theorem provers and extend it to multilingual or more complex multi-premise reasoning.
%U https://aclanthology.org/2026.semeval-1.182/
%P 1407-1415
Markdown (Informal)
[Proofbusters at SemEval-2026 Task 11: Neuro-Symbolic Syllogistic Reasoning via LLM-Guided Structure Extraction and Deterministic Validation](https://aclanthology.org/2026.semeval-1.182/) (Ayman et al., SemEval 2026)
ACL