% Conference-paper record for "Towards a Radiologist Imitation Framework for 3D CT
% Diagnosis with Multimodal LLMs" (BioNLP 2026). Braces in the title protect the
% capitalisation of acronyms against style-driven sentence casing.
@inproceedings{zhang-etal-2026-towards-radiologist,
    title = "Towards a Radiologist Imitation Framework for 3{D} {CT} Diagnosis with Multimodal {LLM}s",
    author = "Zhang, Kaidi  and
      Yan, Zhiyuan  and
      Cheng, Gao  and
      Cai, Zhenyang",
    editor = "Demner-Fushman, Dina  and
      Ananiadou, Sophia  and
      Roberts, Kirk  and
      Tsujii, Junichi",
    booktitle = "{B}io{NLP} 2026",
    month = jul,
    year = "2026",
    address = "San Diego, California",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2026.bionlp-1.85/",
    pages = "1056--1065",
    isbn = "979-8-89176-434-7",
    abstract = "Three-dimensional Computed Tomography (3D CT) is a cornerstone of precision medicine. Most AI diagnostic models analyze large numbers of CT slices uniformly, treating all slices as equally important. While this has partly accelerated radiologists{'} workflows, it overlooks that clinically relevant information is often sparsely distributed throughout a volume. Without targeted or weighted processing, fine-grained cues may be missed and substantial computation wasted on diagnostically uninformative slices. We propose a radiologist-simulating framework for selective and efficient 3D CT interpretation. Evaluated on a 3D CT dataset covering eight thoracic lesion types, it was compared with state-of-the-art multimodal large language models such as GPT-4o and supervised visual backbones including ViT and ResNet-50. Using accuracy, F1-score, AUC, and blind radiologist assessment, Screen-CLIP achieved an AUC of 0.87 and F1-score of 0.82, surpassing ViT Base (AUC: 0.84). For report generation, our method outperformed M3D across all metrics, reaching a BLEU-Avg of 29.03, and achieved the highest average Doctors' Score (6.16/10) in a preliminary human evaluation."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="zhang-etal-2026-towards-radiologist">
<titleInfo>
<title>Towards a Radiologist Imitation Framework for 3D CT Diagnosis with Multimodal LLMs</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kaidi</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhiyuan</namePart>
<namePart type="family">Yan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gao</namePart>
<namePart type="family">Cheng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhenyang</namePart>
<namePart type="family">Cai</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>BioNLP 2026</title>
</titleInfo>
<name type="personal">
<namePart type="given">Dina</namePart>
<namePart type="family">Demner-Fushman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sophia</namePart>
<namePart type="family">Ananiadou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kirk</namePart>
<namePart type="family">Roberts</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Junichi</namePart>
<namePart type="family">Tsujii</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-434-7</identifier>
</relatedItem>
<abstract>Three-dimensional Computed Tomography (3D CT) is a cornerstone of precision medicine. Most AI diagnostic models analyze large numbers of CT slices uniformly, treating all slices as equally important. While this has partly accelerated radiologists’ workflows, it overlooks that clinically relevant information is often sparsely distributed throughout a volume. Without targeted or weighted processing, fine-grained cues may be missed and substantial computation wasted on diagnostically uninformative slices. We propose a radiologist-simulating framework for selective and efficient 3D CT interpretation. Evaluated on a 3D CT dataset covering eight thoracic lesion types, it was compared with state-of-the-art multimodal large language models such as GPT-4o and supervised visual backbones including ViT and ResNet-50. Using accuracy, F1-score, AUC, and blind radiologist assessment, Screen-CLIP achieved an AUC of 0.87 and F1-score of 0.82, surpassing ViT Base (AUC: 0.84). For report generation, our method outperformed M3D across all metrics, reaching a BLEU-Avg of 29.03, and achieved the highest average Doctors’ Score (6.16/10) in a preliminary human evaluation.</abstract>
<identifier type="citekey">zhang-etal-2026-towards-radiologist</identifier>
<location>
<url>https://aclanthology.org/2026.bionlp-1.85/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>1056</start>
<end>1065</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Towards a Radiologist Imitation Framework for 3D CT Diagnosis with Multimodal LLMs
%A Zhang, Kaidi
%A Yan, Zhiyuan
%A Cheng, Gao
%A Cai, Zhenyang
%Y Demner-Fushman, Dina
%Y Ananiadou, Sophia
%Y Roberts, Kirk
%Y Tsujii, Junichi
%S BioNLP 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California
%@ 979-8-89176-434-7
%F zhang-etal-2026-towards-radiologist
%X Three-dimensional Computed Tomography (3D CT) is a cornerstone of precision medicine. Most AI diagnostic models analyze large numbers of CT slices uniformly, treating all slices as equally important. While this has partly accelerated radiologists’ workflows, it overlooks that clinically relevant information is often sparsely distributed throughout a volume. Without targeted or weighted processing, fine-grained cues may be missed and substantial computation wasted on diagnostically uninformative slices. We propose a radiologist-simulating framework for selective and efficient 3D CT interpretation. Evaluated on a 3D CT dataset covering eight thoracic lesion types, it was compared with state-of-the-art multimodal large language models such as GPT-4o and supervised visual backbones including ViT and ResNet-50. Using accuracy, F1-score, AUC, and blind radiologist assessment, Screen-CLIP achieved an AUC of 0.87 and F1-score of 0.82, surpassing ViT Base (AUC: 0.84). For report generation, our method outperformed M3D across all metrics, reaching a BLEU-Avg of 29.03, and achieved the highest average Doctors’ Score (6.16/10) in a preliminary human evaluation.
%U https://aclanthology.org/2026.bionlp-1.85/
%P 1056-1065
Markdown (Informal)
[Towards a Radiologist Imitation Framework for 3D CT Diagnosis with Multimodal LLMs](https://aclanthology.org/2026.bionlp-1.85/) (Zhang et al., BioNLP 2026)
ACL