% Conference paper (SemEval-2026 Task 12 system description).
% Case-sensitive names in title/booktitle are protected as whole braced words
% so that sentence-casing styles keep them intact without breaking kerning.
@inproceedings{liang-etal-2026-tubingen,
  title     = {{T{\"u}bingen-CL} at {SemEval-2026} Task 12: Reinforcement Learning and Verification for Abductive Reasoning},
  author    = {Liang, Bolun and
               Khudaybergenova, Ayperi and
               Kankanamge, Shashikala},
  editor    = {Kochmar, Ekaterina and
               Ghosh, Debanjan and
               North, Kai and
               Komachi, Mamoru},
  booktitle = {Proceedings of the 20th {International Workshop on Semantic Evaluation} (2026)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, USA},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.semeval-1.330/},
  pages     = {2621--2629},
  isbn      = {979-8-89176-414-9},
  abstract  = {We investigate the reliability of verifier-based pipelines for abductive reasoning in SemEval-2026 Task 12. While reinforcement learning improves the base generator{'}s performance, we find that incorporating a small-model verifier introduces a significant generalization gap: although effective on validation data, the verifier systematically degrades correct predictions on the unseen test set by appending false positives. Furthermore, we reveal a critical vulnerability in the official evaluation metric, which assigns zero reward to abstentions but does not sufficiently penalize incorrect selections. This asymmetry enables trivial heuristic strategies such as blindly selecting a default option to substantially inflate performance, even outperforming more principled reasoning systems. Our analysis demonstrates that current evaluation protocols can misrepresent true reasoning ability and highlights the need for more robust verification methods and scoring schemes.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- Single MODS record describing the paper itself. -->
  <mods ID="liang-etal-2026-tubingen">
    <titleInfo>
      <title>Tübingen-CL at SemEval-2026 Task 12: Reinforcement Learning and Verification for Abductive Reasoning</title>
    </titleInfo>

    <!-- Authors, in publication order. -->
    <name type="personal">
      <namePart type="given">Bolun</namePart>
      <namePart type="family">Liang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Ayperi</namePart>
      <namePart type="family">Khudaybergenova</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Shashikala</namePart>
      <namePart type="family">Kankanamge</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>

    <originInfo>
      <dateIssued>2026-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>

    <!-- Host item: the proceedings volume, with its editors and publisher details. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 20th International Workshop on Semantic Evaluation (2026)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Ekaterina</namePart>
        <namePart type="family">Kochmar</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Debanjan</namePart>
        <namePart type="family">Ghosh</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Kai</namePart>
        <namePart type="family">North</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Mamoru</namePart>
        <namePart type="family">Komachi</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">San Diego, California, USA</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-414-9</identifier>
    </relatedItem>

    <abstract>We investigate the reliability of verifier-based pipelines for abductive reasoning in SemEval-2026 Task 12. While reinforcement learning improves the base generator’s performance, we find that incorporating a small-model verifier introduces a significant generalization gap: although effective on validation data, the verifier systematically degrades correct predictions on the unseen test set by appending false positives. Furthermore, we reveal a critical vulnerability in the official evaluation metric, which assigns zero reward to abstentions but does not sufficiently penalize incorrect selections. This asymmetry enables trivial heuristic strategies such as blindly selecting a default option to substantially inflate performance, even outperforming more principled reasoning systems. Our analysis demonstrates that current evaluation protocols can misrepresent true reasoning ability and highlights the need for more robust verification methods and scoring schemes.</abstract>

    <!-- Citation key, landing page, and page range within the host volume. -->
    <identifier type="citekey">liang-etal-2026-tubingen</identifier>
    <location>
      <url>https://aclanthology.org/2026.semeval-1.330/</url>
    </location>
    <part>
      <date>2026-07</date>
      <extent unit="page">
        <start>2621</start>
        <end>2629</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Tübingen-CL at SemEval-2026 Task 12: Reinforcement Learning and Verification for Abductive Reasoning
%A Liang, Bolun
%A Khudaybergenova, Ayperi
%A Kankanamge, Shashikala
%Y Kochmar, Ekaterina
%Y Ghosh, Debanjan
%Y North, Kai
%Y Komachi, Mamoru
%S Proceedings of the 20th International Workshop on Semantic Evaluation (2026)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, USA
%@ 979-8-89176-414-9
%F liang-etal-2026-tubingen
%X We investigate the reliability of verifier-based pipelines for abductive reasoning in SemEval-2026 Task 12. While reinforcement learning improves the base generator’s performance, we find that incorporating a small-model verifier introduces a significant generalization gap: although effective on validation data, the verifier systematically degrades correct predictions on the unseen test set by appending false positives. Furthermore, we reveal a critical vulnerability in the official evaluation metric, which assigns zero reward to abstentions but does not sufficiently penalize incorrect selections. This asymmetry enables trivial heuristic strategies such as blindly selecting a default option to substantially inflate performance, even outperforming more principled reasoning systems. Our analysis demonstrates that current evaluation protocols can misrepresent true reasoning ability and highlights the need for more robust verification methods and scoring schemes.
%U https://aclanthology.org/2026.semeval-1.330/
%P 2621-2629
Markdown (Informal)
[Tübingen-CL at SemEval-2026 Task 12: Reinforcement Learning and Verification for Abductive Reasoning](https://aclanthology.org/2026.semeval-1.330/) (Liang et al., SemEval 2026)
ACL