@inproceedings{jullien-etal-2026-dissecting,
title = "Dissecting Clinical Reasoning in Natural Language Inference for Large Language Models",
author = "Jullien, Mael and
Freitas, Andre and
Valentino, Marco and
Ranaldi, Leonardo",
editor = "Liakata, Maria and
Moreira, Viviane P. and
Zhang, Jiajun and
Jurgens, David",
booktitle = "Findings of the {A}ssociation for {C}omputational {L}inguistics: {ACL} 2026",
month = jul,
year = "2026",
address = "San Diego, California, United States",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.findings-acl.1307/",
doi = "10.18653/v1/2026.findings-acl.1307",
pages = "26229--26250",
ISBN = "979-8-89176-395-1",
abstract = "Recent works on large language models (LLMs) have demonstrated the impact of prompting strategies and fine-tuning techniques on their reasoning capabilities. Yet, their effectiveness on clinical natural language inference (CTNLI) remains underexplored. This study presents the first controlled evaluation of how prompt structure and efficient fine-tuning jointly shape model performance in CTNLI.We inspect four classes of prompting strategies to elicit reasoning in LLMs at different levels of abstraction, and evaluate their impact on a range of clinically motivated reasoning types. For each prompting strategy, we construct high-quality demonstrations using a frontier model to distil multi-step reasoning capabilities into smaller models ({\ensuremath{\leq}} 4B parameters) via Low-Rank Adaptation (LoRA). Across different LLMs fine-tuned on the NLI4CT benchmark, we found that prompt type alone accounts for up to 44{\%} of the variance in macro-F1. Moreover, LoRA fine-tuning yields consistent gains of +8 to 12 F1, raises output alignment above 97{\%}, and narrows the performance gap to GPT-4o-mini to within 7.1{\%}. Additional experiments on reasoning generalisation reveal that LoRA improves performance in 75{\%} of the models on MedNLI and TREC Clinical Trials.Overall, these findings demonstrate that (i) prompt structure is a primary driver of clinical NLI reasoning performance, (ii) compact models equipped with strong prompts and LoRA can rival frontier-scale systems, and (iii) reasoning-type-aware evaluation is essential to uncover prompt-induced trade-offs. Our results highlight the promise of combining prompt design and lightweight adaptation for more efficient and trustworthy clinical NLP systems, providing insights on the strengths and limitations of widely adopted prompting and parameter-efficient techniques in specialised domains. All code, annotations, prompts, demonstrations, and checkpoints will be released upon publication."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="jullien-etal-2026-dissecting">
<titleInfo>
<title>Dissecting Clinical Reasoning in Natural Language Inference for Large Language Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Mael</namePart>
<namePart type="family">Jullien</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Andre</namePart>
<namePart type="family">Freitas</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marco</namePart>
<namePart type="family">Valentino</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Leonardo</namePart>
<namePart type="family">Ranaldi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2026</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Liakata</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Viviane</namePart>
<namePart type="given">P</namePart>
<namePart type="family">Moreira</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiajun</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Jurgens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-395-1</identifier>
</relatedItem>
<abstract>Recent works on large language models (LLMs) have demonstrated the impact of prompting strategies and fine-tuning techniques on their reasoning capabilities. Yet, their effectiveness on clinical natural language inference (CTNLI) remains underexplored. This study presents the first controlled evaluation of how prompt structure and efficient fine-tuning jointly shape model performance in CTNLI.We inspect four classes of prompting strategies to elicit reasoning in LLMs at different levels of abstraction, and evaluate their impact on a range of clinically motivated reasoning types. For each prompting strategy, we construct high-quality demonstrations using a frontier model to distil multi-step reasoning capabilities into smaller models (\ensuremathłeq 4B parameters) via Low-Rank Adaptation (LoRA). Across different LLMs fine-tuned on the NLI4CT benchmark, we found that prompt type alone accounts for up to 44% of the variance in macro-F1. Moreover, LoRA fine-tuning yields consistent gains of +8 to 12 F1, raises output alignment above 97%, and narrows the performance gap to GPT-4o-mini to within 7.1%. Additional experiments on reasoning generalisation reveal that LoRA improves performance in 75% of the models on MedNLI and TREC Clinical Trials.Overall, these findings demonstrate that (i) prompt structure is a primary driver of clinical NLI reasoning performance, (ii) compact models equipped with strong prompts and LoRA can rival frontier-scale systems, and (iii) reasoning-type-aware evaluation is essential to uncover prompt-induced trade-offs. Our results highlight the promise of combining prompt design and lightweight adaptation for more efficient and trustworthy clinical NLP systems, providing insights on the strengths and limitations of widely adopted prompting and parameter-efficient techniques in specialised domains. All code, annotations, prompts, demonstrations, and checkpoints will be released upon publication.</abstract>
<identifier type="citekey">jullien-etal-2026-dissecting</identifier>
<identifier type="doi">10.18653/v1/2026.findings-acl.1307</identifier>
<location>
<url>https://aclanthology.org/2026.findings-acl.1307/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>26229</start>
<end>26250</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Dissecting Clinical Reasoning in Natural Language Inference for Large Language Models
%A Jullien, Mael
%A Freitas, Andre
%A Valentino, Marco
%A Ranaldi, Leonardo
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Findings of the Association for Computational Linguistics: ACL 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-395-1
%F jullien-etal-2026-dissecting
%X Recent works on large language models (LLMs) have demonstrated the impact of prompting strategies and fine-tuning techniques on their reasoning capabilities. Yet, their effectiveness on clinical natural language inference (CTNLI) remains underexplored. This study presents the first controlled evaluation of how prompt structure and efficient fine-tuning jointly shape model performance in CTNLI.We inspect four classes of prompting strategies to elicit reasoning in LLMs at different levels of abstraction, and evaluate their impact on a range of clinically motivated reasoning types. For each prompting strategy, we construct high-quality demonstrations using a frontier model to distil multi-step reasoning capabilities into smaller models (\ensuremathłeq 4B parameters) via Low-Rank Adaptation (LoRA). Across different LLMs fine-tuned on the NLI4CT benchmark, we found that prompt type alone accounts for up to 44% of the variance in macro-F1. Moreover, LoRA fine-tuning yields consistent gains of +8 to 12 F1, raises output alignment above 97%, and narrows the performance gap to GPT-4o-mini to within 7.1%. Additional experiments on reasoning generalisation reveal that LoRA improves performance in 75% of the models on MedNLI and TREC Clinical Trials.Overall, these findings demonstrate that (i) prompt structure is a primary driver of clinical NLI reasoning performance, (ii) compact models equipped with strong prompts and LoRA can rival frontier-scale systems, and (iii) reasoning-type-aware evaluation is essential to uncover prompt-induced trade-offs. Our results highlight the promise of combining prompt design and lightweight adaptation for more efficient and trustworthy clinical NLP systems, providing insights on the strengths and limitations of widely adopted prompting and parameter-efficient techniques in specialised domains. All code, annotations, prompts, demonstrations, and checkpoints will be released upon publication.
%R 10.18653/v1/2026.findings-acl.1307
%U https://aclanthology.org/2026.findings-acl.1307/
%U https://doi.org/10.18653/v1/2026.findings-acl.1307
%P 26229-26250
Markdown (Informal)
[Dissecting Clinical Reasoning in Natural Language Inference for Large Language Models](https://aclanthology.org/2026.findings-acl.1307/) (Jullien et al., Findings 2026)
ACL