@inproceedings{hanken-2026-agentic,
title = "Agentic {AI} Architectures for {SOAP} Note Generation",
author = "Hanken, Keno",
editor = "Demner-Fushman, Dina and
Ananiadou, Sophia and
Roberts, Kirk and
Tsujii, Junichi",
booktitle = "{B}io{NLP} 2026",
month = jul,
year = "2026",
address = "San Diego, California",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.bionlp-1.61/",
pages = "742--752",
ISBN = "979-8-89176-434-7",
abstract = "Clinical documentation places significant time demands on medical professionals, consumes institutional resources, and is prone to errors that may compromise patient care. Recent advances in LLMs offer promising approaches for automating clinical note generation; however, the impact of different AI architectural designs remains underexplored, particularly for agentic AI systems. This study compares three architectures — single-LLM, multi-agentic, and swarm-agentic — for automated SOAP (Subjective, Objective, Assessment, Plan) note generation from doctor–patient dialogues. All approaches employ QLoRA-finetuned Ministral 3 models (3B and 8B parameters) trained on the MedSynth dataset, comprising 10,030 dialogue–note pairs across 2,006 ICD-10 code classes. Performance is evaluated using ROUGE-1, ROUGE-2, ROUGE-L, and BERTScore against a lexical-overlap baseline (dialogue vs. ground-truth SOAP, no inference). Results show that all finetuned models substantially outperform the baseline, while differences between architectural variants remain marginal. The single-LLM setup achieves the strongest performance across all metrics; 3B and 8B variants perform nearly identically on semantic similarity (BERTScore), while ROUGE differences are small but statistically significant. Qualitative inspection further reveals that residual differences across architectures are driven primarily by shared dataset priors rather than by architectural reasoning capacity. The results are based on synthetic data without human evaluation and reflect architectural behavior only."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="hanken-2026-agentic">
<titleInfo>
<title>Agentic AI Architectures for SOAP Note Generation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Keno</namePart>
<namePart type="family">Hanken</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>BioNLP 2026</title>
</titleInfo>
<name type="personal">
<namePart type="given">Dina</namePart>
<namePart type="family">Demner-Fushman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sophia</namePart>
<namePart type="family">Ananiadou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kirk</namePart>
<namePart type="family">Roberts</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Junichi</namePart>
<namePart type="family">Tsujii</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-434-7</identifier>
</relatedItem>
<abstract>Clinical documentation places significant time demands on medical professionals, consumes institutional resources, and is prone to errors that may compromise patient care. Recent advances in LLMs offer promising approaches for automating clinical note generation; however, the impact of different AI architectural designs remains underexplored, particularly for agentic AI systems. This study compares three architectures — single-LLM, multi-agentic, and swarm-agentic — for automated SOAP (Subjective, Objective, Assessment, Plan) note generation from doctor–patient dialogues. All approaches employ QLoRA-finetuned Ministral 3 models (3B and 8B parameters) trained on the MedSynth dataset, comprising 10,030 dialogue–note pairs across 2,006 ICD-10 code classes. Performance is evaluated using ROUGE-1, ROUGE-2, ROUGE-L, and BERTScore against a lexical-overlap baseline (dialogue vs. ground-truth SOAP, no inference). Results show that all finetuned models substantially outperform the baseline, while differences between architectural variants remain marginal. The single-LLM setup achieves the strongest performance across all metrics; 3B and 8B variants perform nearly identically on semantic similarity (BERTScore), while ROUGE differences are small but statistically significant. Qualitative inspection further reveals that residual differences across architectures are driven primarily by shared dataset priors rather than by architectural reasoning capacity. The results are based on synthetic data without human evaluation and reflect architectural behavior only.</abstract>
<identifier type="citekey">hanken-2026-agentic</identifier>
<location>
<url>https://aclanthology.org/2026.bionlp-1.61/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>742</start>
<end>752</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Agentic AI Architectures for SOAP Note Generation
%A Hanken, Keno
%Y Demner-Fushman, Dina
%Y Ananiadou, Sophia
%Y Roberts, Kirk
%Y Tsujii, Junichi
%S BioNLP 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California
%@ 979-8-89176-434-7
%F hanken-2026-agentic
%X Clinical documentation places significant time demands on medical professionals, consumes institutional resources, and is prone to errors that may compromise patient care. Recent advances in LLMs offer promising approaches for automating clinical note generation; however, the impact of different AI architectural designs remains underexplored, particularly for agentic AI systems. This study compares three architectures — single-LLM, multi-agentic, and swarm-agentic — for automated SOAP (Subjective, Objective, Assessment, Plan) note generation from doctor–patient dialogues. All approaches employ QLoRA-finetuned Ministral 3 models (3B and 8B parameters) trained on the MedSynth dataset, comprising 10,030 dialogue–note pairs across 2,006 ICD-10 code classes. Performance is evaluated using ROUGE-1, ROUGE-2, ROUGE-L, and BERTScore against a lexical-overlap baseline (dialogue vs. ground-truth SOAP, no inference). Results show that all finetuned models substantially outperform the baseline, while differences between architectural variants remain marginal. The single-LLM setup achieves the strongest performance across all metrics; 3B and 8B variants perform nearly identically on semantic similarity (BERTScore), while ROUGE differences are small but statistically significant. Qualitative inspection further reveals that residual differences across architectures are driven primarily by shared dataset priors rather than by architectural reasoning capacity. The results are based on synthetic data without human evaluation and reflect architectural behavior only.
%U https://aclanthology.org/2026.bionlp-1.61/
%P 742-752
Markdown (Informal)
[Agentic AI Architectures for SOAP Note Generation](https://aclanthology.org/2026.bionlp-1.61/) (Hanken, BioNLP 2026)
ACL