@inproceedings{liu-etal-2026-dialogue,
title = "Dialogue is Better Than Monologue: Instructing Medical {LLM}s via Strategic Conversations",
author = "Liu, Zijie and
Zhao, Xinyu and
Peng, Jie and
Duan, Jinhao and
Zhu, Zhuangdi and
Chen, Qingyu and
Xu, Kaidi and
Hu, Xia and
Chen, Tianlong",
editor = "Demberg, Vera and
Inui, Kentaro and
Marquez, Llu{\'i}s",
booktitle = "Findings of the {A}ssociation for {C}omputational {L}inguistics: {EACL} 2026",
month = mar,
year = "2026",
address = "Rabat, Morocco",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.findings-eacl.149/",
doi = "10.18653/v1/2026.findings-eacl.149",
pages = "2858--2872",
ISBN = "979-8-89176-386-9",
abstract = "In real clinical practice, clinicians must sift through noisy and often conflicting information, progressively gathering and sequencing evidence before reaching conclusions. However, existing tuning methods for medical AI models are typically \textbf{monologue-based} {---} that is, models are fine-tuned on static question answering (QA) tasks or medical articles, which fail to reflect the interactive and iterative nature of clinical reasoning. To bridge this gap, we introduce \textbf{MuddyMaze}, a benchmark designed to expose the limitations of current \textbf{monologue-based} tuning, and construct a large dialogue dataset of 22.2k doctor{--}patient interactions that capture stepwise diagnostic reasoning validated by medical experts. Building on those, we propose \textbf{dialogue-tuning}, a new fine-tuning paradigm that captures the internal reasoning dynamics unfolding across interactions. To assess the effectiveness of our approach, we evaluated \textit{dialogue-tuned} models on \textbf{MuddyMaze}, where they outperform \textit{monologue-tuned} baselines (e.g., MedQA) by +16.1{\%} in one-round and +4.1{\%} in multi-round evidence ranking, while maintaining or even improving accuracy on standard medical QA benchmarks (e.g., PubMedQA). These results indicate that \textbf{dialogue-tuning} not only enhances reasoning robustness and evidence integration but also preserves the factual precision of traditional QA performance."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="liu-etal-2026-dialogue">
<titleInfo>
<title>Dialogue is Better Than Monologue: Instructing Medical LLMs via Strategic Conversations</title>
</titleInfo>
<name type="personal">
<namePart type="given">Zijie</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xinyu</namePart>
<namePart type="family">Zhao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jie</namePart>
<namePart type="family">Peng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jinhao</namePart>
<namePart type="family">Duan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhuangdi</namePart>
<namePart type="family">Zhu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Qingyu</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kaidi</namePart>
<namePart type="family">Xu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xia</namePart>
<namePart type="family">Hu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tianlong</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EACL 2026</title>
</titleInfo>
<name type="personal">
<namePart type="given">Vera</namePart>
<namePart type="family">Demberg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kentaro</namePart>
<namePart type="family">Inui</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lluís</namePart>
<namePart type="family">Marquez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Rabat, Morocco</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-386-9</identifier>
</relatedItem>
<abstract>In real clinical practice, clinicians must sift through noisy and often conflicting information, progressively gathering and sequencing evidence before reaching conclusions. However, existing tuning methods for medical AI models are typically monologue-based — that is, models are fine-tuned on static question answering (QA) tasks or medical articles, which fail to reflect the interactive and iterative nature of clinical reasoning. To bridge this gap, we introduce MuddyMaze, a benchmark designed to expose the limitations of current monologue-based tuning, and construct a large dialogue dataset of 22.2k doctor–patient interactions that capture stepwise diagnostic reasoning validated by medical experts. Building on those, we propose dialogue-tuning, a new fine-tuning paradigm that captures the internal reasoning dynamics unfolding across interactions. To assess the effectiveness of our approach, we evaluated dialogue-tuned models on MuddyMaze, where they outperform monologue-tuned baselines (e.g., MedQA) by +16.1% in one-round and +4.1% in multi-round evidence ranking, while maintaining or even improving accuracy on standard medical QA benchmarks (e.g., PubMedQA). These results indicate that dialogue-tuning not only enhances reasoning robustness and evidence integration but also preserves the factual precision of traditional QA performance.</abstract>
<identifier type="citekey">liu-etal-2026-dialogue</identifier>
<identifier type="doi">10.18653/v1/2026.findings-eacl.149</identifier>
<location>
<url>https://aclanthology.org/2026.findings-eacl.149/</url>
</location>
<part>
<date>2026-03</date>
<extent unit="page">
<start>2858</start>
<end>2872</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Dialogue is Better Than Monologue: Instructing Medical LLMs via Strategic Conversations
%A Liu, Zijie
%A Zhao, Xinyu
%A Peng, Jie
%A Duan, Jinhao
%A Zhu, Zhuangdi
%A Chen, Qingyu
%A Xu, Kaidi
%A Hu, Xia
%A Chen, Tianlong
%Y Demberg, Vera
%Y Inui, Kentaro
%Y Marquez, Lluís
%S Findings of the Association for Computational Linguistics: EACL 2026
%D 2026
%8 March
%I Association for Computational Linguistics
%C Rabat, Morocco
%@ 979-8-89176-386-9
%F liu-etal-2026-dialogue
%X In real clinical practice, clinicians must sift through noisy and often conflicting information, progressively gathering and sequencing evidence before reaching conclusions. However, existing tuning methods for medical AI models are typically monologue-based — that is, models are fine-tuned on static question answering (QA) tasks or medical articles, which fail to reflect the interactive and iterative nature of clinical reasoning. To bridge this gap, we introduce MuddyMaze, a benchmark designed to expose the limitations of current monologue-based tuning, and construct a large dialogue dataset of 22.2k doctor–patient interactions that capture stepwise diagnostic reasoning validated by medical experts. Building on those, we propose dialogue-tuning, a new fine-tuning paradigm that captures the internal reasoning dynamics unfolding across interactions. To assess the effectiveness of our approach, we evaluated dialogue-tuned models on MuddyMaze, where they outperform monologue-tuned baselines (e.g., MedQA) by +16.1% in one-round and +4.1% in multi-round evidence ranking, while maintaining or even improving accuracy on standard medical QA benchmarks (e.g., PubMedQA). These results indicate that dialogue-tuning not only enhances reasoning robustness and evidence integration but also preserves the factual precision of traditional QA performance.
%R 10.18653/v1/2026.findings-eacl.149
%U https://aclanthology.org/2026.findings-eacl.149/
%U https://doi.org/10.18653/v1/2026.findings-eacl.149
%P 2858-2872
Markdown (Informal)
[Dialogue is Better Than Monologue: Instructing Medical LLMs via Strategic Conversations](https://aclanthology.org/2026.findings-eacl.149/) (Liu et al., Findings 2026)
ACL
- Zijie Liu, Xinyu Zhao, Jie Peng, Jinhao Duan, Zhuangdi Zhu, Qingyu Chen, Kaidi Xu, Xia Hu, and Tianlong Chen. 2026. Dialogue is Better Than Monologue: Instructing Medical LLMs via Strategic Conversations. In Findings of the Association for Computational Linguistics: EACL 2026, pages 2858–2872, Rabat, Morocco. Association for Computational Linguistics.