@inproceedings{jiang-etal-2025-gpt,
title = "{GPT}-4{V} Cannot Generate Radiology Reports Yet",
author = "Jiang, Yuyang and
Chen, Chacha and
Nguyen, Dang and
Mervak, Benjamin M. and
Tan, Chenhao",
editor = "Chiruzzo, Luis and
Ritter, Alan and
Wang, Lu",
booktitle = "Findings of the Association for Computational Linguistics: NAACL 2025",
month = apr,
year = "2025",
address = "Albuquerque, New Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-naacl.113/",
doi = "10.18653/v1/2025.findings-naacl.113",
pages = "2127--2154",
ISBN = "979-8-89176-195-7",
abstract = "GPT-4{'}s purported strong multimodal abilities raise interests in using it to automate radiology report writing, but there lacks thorough evaluations. In this work, we perform a systematic evaluation of GPT-4 (4o and vision-preview) in generating radiology reports across three chest X-ray report benchmarks: MIMIC-CXR, CheXpert Plus, and IU X-Ray. We attempt to directly generate reports with different prompting strategies and find that the models fail terribly in both lexical metrics and clinical efficacy metrics. To understand the low performance, we decompose the task into two steps: 1) the **medical image reasoning** step of predicting medical condition labels from images; and 2) the **report synthesis** step of generating reports from (groundtruth) conditions. We show that GPT-4{'}s performance in image reasoning is consistently low across different prompts. In fact, the distributions of model-predicted labels remain constant regardless of which groundtruth conditions are present on the image, suggesting that the model is not interpreting chest X-rays meaningfully. Even when given groundtruth conditions in report synthesis, its generated reports are less correct and less natural-sounding than a finetuned Llama. Altogether, our findings cast doubt on the viability of using GPT-4 in a radiology workflow."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="jiang-etal-2025-gpt">
<titleInfo>
<title>GPT-4V Cannot Generate Radiology Reports Yet</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yuyang</namePart>
<namePart type="family">Jiang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chacha</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dang</namePart>
<namePart type="family">Nguyen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Benjamin</namePart>
<namePart type="given">M</namePart>
<namePart type="family">Mervak</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chenhao</namePart>
<namePart type="family">Tan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-04</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: NAACL 2025</title>
</titleInfo>
<name type="personal">
<namePart type="given">Luis</namePart>
<namePart type="family">Chiruzzo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alan</namePart>
<namePart type="family">Ritter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lu</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Albuquerque, New Mexico</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-195-7</identifier>
</relatedItem>
<abstract>GPT-4’s purported strong multimodal abilities raise interests in using it to automate radiology report writing, but there lacks thorough evaluations. In this work, we perform a systematic evaluation of GPT-4 (4o and vision-preview) in generating radiology reports across three chest X-ray report benchmarks: MIMIC-CXR, CheXpert Plus, and IU X-Ray. We attempt to directly generate reports with different prompting strategies and find that the models fail terribly in both lexical metrics and clinical efficacy metrics. To understand the low performance, we decompose the task into two steps: 1) the **medical image reasoning** step of predicting medical condition labels from images; and 2) the **report synthesis** step of generating reports from (groundtruth) conditions. We show that GPT-4’s performance in image reasoning is consistently low across different prompts. In fact, the distributions of model-predicted labels remain constant regardless of which groundtruth conditions are present on the image, suggesting that the model is not interpreting chest X-rays meaningfully. Even when given groundtruth conditions in report synthesis, its generated reports are less correct and less natural-sounding than a finetuned Llama. Altogether, our findings cast doubt on the viability of using GPT-4 in a radiology workflow.</abstract>
<identifier type="citekey">jiang-etal-2025-gpt</identifier>
<identifier type="doi">10.18653/v1/2025.findings-naacl.113</identifier>
<location>
<url>https://aclanthology.org/2025.findings-naacl.113/</url>
</location>
<part>
<date>2025-04</date>
<extent unit="page">
<start>2127</start>
<end>2154</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T GPT-4V Cannot Generate Radiology Reports Yet
%A Jiang, Yuyang
%A Chen, Chacha
%A Nguyen, Dang
%A Mervak, Benjamin M.
%A Tan, Chenhao
%Y Chiruzzo, Luis
%Y Ritter, Alan
%Y Wang, Lu
%S Findings of the Association for Computational Linguistics: NAACL 2025
%D 2025
%8 April
%I Association for Computational Linguistics
%C Albuquerque, New Mexico
%@ 979-8-89176-195-7
%F jiang-etal-2025-gpt
%X GPT-4’s purported strong multimodal abilities raise interests in using it to automate radiology report writing, but there lacks thorough evaluations. In this work, we perform a systematic evaluation of GPT-4 (4o and vision-preview) in generating radiology reports across three chest X-ray report benchmarks: MIMIC-CXR, CheXpert Plus, and IU X-Ray. We attempt to directly generate reports with different prompting strategies and find that the models fail terribly in both lexical metrics and clinical efficacy metrics. To understand the low performance, we decompose the task into two steps: 1) the **medical image reasoning** step of predicting medical condition labels from images; and 2) the **report synthesis** step of generating reports from (groundtruth) conditions. We show that GPT-4’s performance in image reasoning is consistently low across different prompts. In fact, the distributions of model-predicted labels remain constant regardless of which groundtruth conditions are present on the image, suggesting that the model is not interpreting chest X-rays meaningfully. Even when given groundtruth conditions in report synthesis, its generated reports are less correct and less natural-sounding than a finetuned Llama. Altogether, our findings cast doubt on the viability of using GPT-4 in a radiology workflow.
%R 10.18653/v1/2025.findings-naacl.113
%U https://aclanthology.org/2025.findings-naacl.113/
%U https://doi.org/10.18653/v1/2025.findings-naacl.113
%P 2127-2154
Markdown (Informal)
[GPT-4V Cannot Generate Radiology Reports Yet](https://aclanthology.org/2025.findings-naacl.113/) (Jiang et al., Findings 2025)
ACL
- Yuyang Jiang, Chacha Chen, Dang Nguyen, Benjamin M. Mervak, and Chenhao Tan. 2025. GPT-4V Cannot Generate Radiology Reports Yet. In Findings of the Association for Computational Linguistics: NAACL 2025, pages 2127–2154, Albuquerque, New Mexico. Association for Computational Linguistics.