@inproceedings{merenda-etal-2026-llms,
title = "Can {LLM}s Reason Like Doctors? Exploring the Limits of Large Language Models in Complex Medical Reasoning",
author = "Merenda, Flavio and
Gomez-Perez, Jose Manuel and
Rigau, German",
editor = "Demberg, Vera and
Inui, Kentaro and
Marquez, Llu{\'i}s",
booktitle = "Findings of the {A}ssociation for {C}omputational {L}inguistics: {EACL} 2026",
month = mar,
year = "2026",
address = "Rabat, Morocco",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.findings-eacl.127/",
pages = "2432--2452",
ISBN = "979-8-89176-386-9",
abstract = "Large language models (LLMs) have shown remarkable progress in reasoning across multiple domains. However, it remains unclear whether their abilities reflect genuine reasoning or sophisticated pattern matching, a distinction critical in medical decision-making, where reliable multi-step problem-solving is required. Accordingly, we conduct one of the largest evaluations to date, assessing 77 LLMs with diverse fine-tuning approaches, ranging from 1 billion parameters to frontier models. Guided by medical problem-solving theory, we select three medical question answering (QA) benchmarks targeting key reasoning skills: reasoning processes, susceptibility to cognitive biases, and metacognitive abilities. Additionally, we manually annotate a subset of questions to assess the abduction, deduction, and induction capabilities of LLMs, offering detailed insight into the reasoning mechanisms followed by physicians, an aspect that has received relatively limited attention in this domain. Most models, particularly smaller ones, struggle even with specialized fine-tuning or advanced prompting. Larger models perform better but still show clear limitations in complex medical reasoning. Our findings highlight the need to improve specific reasoning strategies to better reflect medical decision-making. The datasets and code used in this study are publicly available at: \url{https://github.com/expertailab/Can-LLMs-Reason-Like-Doctors}"
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="merenda-etal-2026-llms">
<titleInfo>
<title>Can LLMs Reason Like Doctors? Exploring the Limits of Large Language Models in Complex Medical Reasoning</title>
</titleInfo>
<name type="personal">
<namePart type="given">Flavio</namePart>
<namePart type="family">Merenda</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jose</namePart>
<namePart type="given">Manuel</namePart>
<namePart type="family">Gomez-Perez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">German</namePart>
<namePart type="family">Rigau</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EACL 2026</title>
</titleInfo>
<name type="personal">
<namePart type="given">Vera</namePart>
<namePart type="family">Demberg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kentaro</namePart>
<namePart type="family">Inui</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lluís</namePart>
<namePart type="family">Marquez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Rabat, Morocco</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-386-9</identifier>
</relatedItem>
<abstract>Large language models (LLMs) have shown remarkable progress in reasoning across multiple domains. However, it remains unclear whether their abilities reflect genuine reasoning or sophisticated pattern matching, a distinction critical in medical decision-making, where reliable multi-step problem-solving is required. Accordingly, we conduct one of the largest evaluations to date, assessing 77 LLMs with diverse fine-tuning approaches, ranging from 1 billion parameters to frontier models. Guided by medical problem-solving theory, we select three medical question answering (QA) benchmarks targeting key reasoning skills: reasoning processes, susceptibility to cognitive biases, and metacognitive abilities. Additionally, we manually annotate a subset of questions to assess the abduction, deduction, and induction capabilities of LLMs, offering detailed insight into the reasoning mechanisms followed by physicians, an aspect that has received relatively limited attention in this domain. Most models, particularly smaller ones, struggle even with specialized fine-tuning or advanced prompting. Larger models perform better but still show clear limitations in complex medical reasoning. Our findings highlight the need to improve specific reasoning strategies to better reflect medical decision-making. The datasets and code used in this study are publicly available at: https://github.com/expertailab/Can-LLMs-Reason-Like-Doctors</abstract>
<identifier type="citekey">merenda-etal-2026-llms</identifier>
<location>
<url>https://aclanthology.org/2026.findings-eacl.127/</url>
</location>
<part>
<date>2026-03</date>
<extent unit="page">
<start>2432</start>
<end>2452</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Can LLMs Reason Like Doctors? Exploring the Limits of Large Language Models in Complex Medical Reasoning
%A Merenda, Flavio
%A Gomez-Perez, Jose Manuel
%A Rigau, German
%Y Demberg, Vera
%Y Inui, Kentaro
%Y Marquez, Lluís
%S Findings of the Association for Computational Linguistics: EACL 2026
%D 2026
%8 March
%I Association for Computational Linguistics
%C Rabat, Morocco
%@ 979-8-89176-386-9
%F merenda-etal-2026-llms
%X Large language models (LLMs) have shown remarkable progress in reasoning across multiple domains. However, it remains unclear whether their abilities reflect genuine reasoning or sophisticated pattern matching, a distinction critical in medical decision-making, where reliable multi-step problem-solving is required. Accordingly, we conduct one of the largest evaluations to date, assessing 77 LLMs with diverse fine-tuning approaches, ranging from 1 billion parameters to frontier models. Guided by medical problem-solving theory, we select three medical question answering (QA) benchmarks targeting key reasoning skills: reasoning processes, susceptibility to cognitive biases, and metacognitive abilities. Additionally, we manually annotate a subset of questions to assess the abduction, deduction, and induction capabilities of LLMs, offering detailed insight into the reasoning mechanisms followed by physicians, an aspect that has received relatively limited attention in this domain. Most models, particularly smaller ones, struggle even with specialized fine-tuning or advanced prompting. Larger models perform better but still show clear limitations in complex medical reasoning. Our findings highlight the need to improve specific reasoning strategies to better reflect medical decision-making. The datasets and code used in this study are publicly available at: https://github.com/expertailab/Can-LLMs-Reason-Like-Doctors
%U https://aclanthology.org/2026.findings-eacl.127/
%P 2432-2452
Markdown (Informal)
[Can LLMs Reason Like Doctors? Exploring the Limits of Large Language Models in Complex Medical Reasoning](https://aclanthology.org/2026.findings-eacl.127/) (Merenda et al., Findings 2026)
ACL