@comment{
  SemEval-2026 Task 11 overview paper, from the proceedings of the 20th
  International Workshop on Semantic Evaluation.
  Case-sensitive words are protected with whole-word braces so that styles
  applying sentence casing keep them intact.
  NOTE(review): address carries the location given in the source record,
  which looks like the conference venue rather than the publisher's city;
  confirm this is what the target style expects.
}
@inproceedings{valentino-etal-2026-semeval,
  title     = {{SemEval}-2026 Task 11: Disentangling Content and Formal Reasoning in Large Language Models},
  author    = {Valentino, Marco and
               Ranaldi, Leonardo and
               Pucci, Giulia and
               Ranaldi, Federico and
               Freitas, Andr{\'e}},
  editor    = {Kochmar, Ekaterina and
               Ghosh, Debanjan and
               North, Kai and
               Komachi, Mamoru},
  booktitle = {Proceedings of the 20th {International} {Workshop} on {Semantic} {Evaluation} (2026)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, USA},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.semeval-1.450/},
  pages     = {3716--3730},
  isbn      = {979-8-89176-414-9},
  abstract  = {SemEval-2026 Task 11 evaluates the ability of Large Language Models (LLMs) to perform content-independent reasoning through a novel multilingual syllogistic dataset designed to measure the ``content effect'' {---} the tendency to conflate semantic plausibility with logical validity. The competition featured four subtasks, covering English and multilingual settings with both standard and noisy premise sets. Evaluations of zero-shot baselines reveal that the content effect is pervasive in open models, whereas newer versions demonstrate a significant shift in performance. Across the subtasks, findings indicate that introducing distracting premises can challenge the models' ability to filter misleading information, while multilingual settings amplify their susceptibility to content biases compared to English. Participants proposed diverse approaches, including neuro-symbolic decomposition, fine-tuning and distillation methods, data augmentation, and activation steering. While explicit symbolic verification remains the most reliable strategy, activation-level interventions and fine-tuning methods offer promising pathways for internalising formal logic within neural architectures. Ultimately, the task reinforces the efficacy of neuro-symbolic approaches and emerging architectural trends for logical reliability, while also highlighting that multilingual setups and longer contexts still pose significant challenges to be investigated in future research.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<!-- MODS v3 collection containing a single bibliographic record. -->
<!-- Record for the task paper; the ID repeats the citekey identifier given further down. -->
<mods ID="valentino-etal-2026-semeval">
<titleInfo>
<title>SemEval-2026 Task 11: Disentangling Content and Formal Reasoning in Large Language Models</title>
</titleInfo>
<!-- Paper authors, one <name> element per person, each with the marcrelator role "author". -->
<name type="personal">
<namePart type="given">Marco</namePart>
<namePart type="family">Valentino</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Leonardo</namePart>
<namePart type="family">Ranaldi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Giulia</namePart>
<namePart type="family">Pucci</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Federico</namePart>
<namePart type="family">Ranaldi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">André</namePart>
<namePart type="family">Freitas</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<!-- Issue date of the paper itself, given as year-month. -->
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<!-- Host item: the proceedings volume, with its editors, publisher, place, genre and ISBN. -->
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 20th International Workshop on Semantic Evaluation (2026)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Kochmar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Debanjan</namePart>
<namePart type="family">Ghosh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kai</namePart>
<namePart type="family">North</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mamoru</namePart>
<namePart type="family">Komachi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-414-9</identifier>
</relatedItem>
<abstract>SemEval-2026 Task 11 evaluates the ability of Large Language Models (LLMs) to perform content-independent reasoning through a novel multilingual syllogistic dataset designed to measure the “content effect” — the tendency to conflate semantic plausibility with logical validity. The competition featured four subtasks, covering English and multilingual settings with both standard and noisy premise sets. Evaluations of zero-shot baselines reveal that the content effect is pervasive in open models, whereas newer versions demonstrate a significant shift in performance. Across the subtasks, findings indicate that introducing distracting premises can challenge the models’ ability to filter misleading information, while multilingual settings amplify their susceptibility to content biases compared to English. Participants proposed diverse approaches, including neuro-symbolic decomposition, fine-tuning and distillation methods, data augmentation, and activation steering. While explicit symbolic verification remains the most reliable strategy, activation-level interventions and fine-tuning methods offer promising pathways for internalising formal logic within neural architectures. Ultimately, the task reinforces the efficacy of neuro-symbolic approaches and emerging architectural trends for logical reliability, while also highlighting that multilingual setups and longer contexts still pose significant challenges to be investigated in future research.</abstract>
<!-- Citation key; same value as the ID attribute on the enclosing <mods> element. -->
<identifier type="citekey">valentino-etal-2026-semeval</identifier>
<location>
<url>https://aclanthology.org/2026.semeval-1.450/</url>
</location>
<!-- Position of the paper within the host volume: date plus first and last page. -->
<part>
<date>2026-07</date>
<extent unit="page">
<start>3716</start>
<end>3730</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T SemEval-2026 Task 11: Disentangling Content and Formal Reasoning in Large Language Models
%A Valentino, Marco
%A Ranaldi, Leonardo
%A Pucci, Giulia
%A Ranaldi, Federico
%A Freitas, André
%Y Kochmar, Ekaterina
%Y Ghosh, Debanjan
%Y North, Kai
%Y Komachi, Mamoru
%S Proceedings of the 20th International Workshop on Semantic Evaluation (2026)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, USA
%@ 979-8-89176-414-9
%F valentino-etal-2026-semeval
%X SemEval-2026 Task 11 evaluates the ability of Large Language Models (LLMs) to perform content-independent reasoning through a novel multilingual syllogistic dataset designed to measure the “content effect” — the tendency to conflate semantic plausibility with logical validity. The competition featured four subtasks, covering English and multilingual settings with both standard and noisy premise sets. Evaluations of zero-shot baselines reveal that the content effect is pervasive in open models, whereas newer versions demonstrate a significant shift in performance. Across the subtasks, findings indicate that introducing distracting premises can challenge the models’ ability to filter misleading information, while multilingual settings amplify their susceptibility to content biases compared to English. Participants proposed diverse approaches, including neuro-symbolic decomposition, fine-tuning and distillation methods, data augmentation, and activation steering. While explicit symbolic verification remains the most reliable strategy, activation-level interventions and fine-tuning methods offer promising pathways for internalising formal logic within neural architectures. Ultimately, the task reinforces the efficacy of neuro-symbolic approaches and emerging architectural trends for logical reliability, while also highlighting that multilingual setups and longer contexts still pose significant challenges to be investigated in future research.
%U https://aclanthology.org/2026.semeval-1.450/
%P 3716-3730
Markdown (Informal)
[SemEval-2026 Task 11: Disentangling Content and Formal Reasoning in Large Language Models](https://aclanthology.org/2026.semeval-1.450/) (Valentino et al., SemEval 2026)
ACL