% Conference paper in the ACL 2026 long-papers volume (Volume 1).
% Case-sensitive words in title/booktitle are protected with whole-word
% braces so sentence-casing styles keep their capitalisation.
@inproceedings{liu-etal-2026-inslogicbench,
    title     = {{InsLogicBench}: An Argumentation Logic Grounded Benchmark for Complex Insurance Claims Adjudication},
    author    = {Liu, Jin and
                 Liu, Yunpeng and
                 Wang, Keyi and
                 Shi, Jie and
                 Xu, Xiao and
                 Huang, Wenkang and
                 Xu, Xingzhong and
                 Liang, Xin and
                 Xiao, Yanghua},
    editor    = {Liakata, Maria and
                 Moreira, Viviane P. and
                 Zhang, Jiajun and
                 Jurgens, David},
    booktitle = {Proceedings of the 64th Annual Meeting of the {Association} for {Computational} {Linguistics} (Volume 1: Long Papers)},
    month     = jul,
    year      = {2026},
    address   = {San Diego, California, United States},
    publisher = {Association for Computational Linguistics},
    url       = {https://aclanthology.org/2026.acl-long.1035/},
    pages     = {22592--22619},
    isbn      = {979-8-89176-390-6},
    abstract  = {Insurance claims adjudication demands not only accurate decisions but also interpretable reasoning grounded in policy clauses. However, existing benchmarks are limited to information retrieval or simple multiple-choice setups, which fail to require step-by-step inferences from facts to conclusions. To address this gap, we introduce InsLogicBench, a benchmark providing complete reasoning traces that link factual inputs, relevant policy clauses, and final verdicts. We construct the dataset using a controllable synthesis framework based on the Nested Toulmin Model. By capturing the defeasible logic of insurance policies through hierarchical truth assignment and enforcing validity via consistency verification, we ensure interpretability and logical rigor across generated examples. We evaluate eight Large Language Models (LLMs) on InsLogicBench. Results show significant difficulties in handling exception clauses and verifying missing conditions. Notably, models often produce correct final decisions but fail to provide precise justifications, highlighting a critical discrepancy between their decision accuracy and logical reasoning capabilities.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- MODS record for the paper with citekey liu-etal-2026-inslogicbench. -->
  <mods ID="liu-etal-2026-inslogicbench">
    <titleInfo>
      <title>InsLogicBench: An Argumentation Logic Grounded Benchmark for Complex Insurance Claims Adjudication</title>
    </titleInfo>

    <!-- Authors, in the order they are listed for the paper. -->
    <name type="personal">
      <namePart type="given">Jin</namePart>
      <namePart type="family">Liu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Yunpeng</namePart>
      <namePart type="family">Liu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Keyi</namePart>
      <namePart type="family">Wang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Jie</namePart>
      <namePart type="family">Shi</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Xiao</namePart>
      <namePart type="family">Xu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Wenkang</namePart>
      <namePart type="family">Huang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Xingzhong</namePart>
      <namePart type="family">Xu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Xin</namePart>
      <namePart type="family">Liang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Yanghua</namePart>
      <namePart type="family">Xiao</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>

    <originInfo>
      <dateIssued>2026-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>

    <!-- Host item: the proceedings volume, with its editors and publisher. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Maria</namePart>
        <namePart type="family">Liakata</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Viviane</namePart>
        <namePart type="given">P</namePart>
        <namePart type="family">Moreira</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jiajun</namePart>
        <namePart type="family">Zhang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">David</namePart>
        <namePart type="family">Jurgens</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">San Diego, California, United States</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-390-6</identifier>
    </relatedItem>

    <abstract>Insurance claims adjudication demands not only accurate decisions but also interpretable reasoning grounded in policy clauses. However, existing benchmarks are limited to information retrieval or simple multiple-choice setups, which fail to require step-by-step inferences from facts to conclusions. To address this gap, we introduce InsLogicBench, a benchmark providing complete reasoning traces that link factual inputs, relevant policy clauses, and final verdicts. We construct the dataset using a controllable synthesis framework based on the Nested Toulmin Model. By capturing the defeasible logic of insurance policies through hierarchical truth assignment and enforcing validity via consistency verification, we ensure interpretability and logical rigor across generated examples. We evaluate eight Large Language Models (LLMs) on InsLogicBench. Results show significant difficulties in handling exception clauses and verifying missing conditions. Notably, models often produce correct final decisions but fail to provide precise justifications, highlighting a critical discrepancy between their decision accuracy and logical reasoning capabilities.</abstract>
    <identifier type="citekey">liu-etal-2026-inslogicbench</identifier>
    <location>
      <url>https://aclanthology.org/2026.acl-long.1035/</url>
    </location>

    <!-- Issue date and page extent of the paper within the host volume. -->
    <part>
      <date>2026-07</date>
      <extent unit="page">
        <start>22592</start>
        <end>22619</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T InsLogicBench: An Argumentation Logic Grounded Benchmark for Complex Insurance Claims Adjudication
%A Liu, Jin
%A Liu, Yunpeng
%A Wang, Keyi
%A Shi, Jie
%A Xu, Xiao
%A Huang, Wenkang
%A Xu, Xingzhong
%A Liang, Xin
%A Xiao, Yanghua
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-390-6
%F liu-etal-2026-inslogicbench
%X Insurance claims adjudication demands not only accurate decisions but also interpretable reasoning grounded in policy clauses. However, existing benchmarks are limited to information retrieval or simple multiple-choice setups, which fail to require step-by-step inferences from facts to conclusions. To address this gap, we introduce InsLogicBench, a benchmark providing complete reasoning traces that link factual inputs, relevant policy clauses, and final verdicts. We construct the dataset using a controllable synthesis framework based on the Nested Toulmin Model. By capturing the defeasible logic of insurance policies through hierarchical truth assignment and enforcing validity via consistency verification, we ensure interpretability and logical rigor across generated examples. We evaluate eight Large Language Models (LLMs) on InsLogicBench. Results show significant difficulties in handling exception clauses and verifying missing conditions. Notably, models often produce correct final decisions but fail to provide precise justifications, highlighting a critical discrepancy between their decision accuracy and logical reasoning capabilities.
%U https://aclanthology.org/2026.acl-long.1035/
%P 22592-22619
Markdown (Informal)
[InsLogicBench: An Argumentation Logic Grounded Benchmark for Complex Insurance Claims Adjudication](https://aclanthology.org/2026.acl-long.1035/) (Liu et al., ACL 2026)
ACL
- Jin Liu, Yunpeng Liu, Keyi Wang, Jie Shi, Xiao Xu, Wenkang Huang, Xingzhong Xu, Xin Liang, and Yanghua Xiao. 2026. InsLogicBench: An Argumentation Logic Grounded Benchmark for Complex Insurance Claims Adjudication. In Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 22592–22619, San Diego, California, United States. Association for Computational Linguistics.