@inproceedings{huang-etal-2025-cmeaa,
title = "{C}m{EAA}: Cross-modal Enhancement and Alignment Adapter for Radiology Report Generation",
author = "Huang, Xiyang and
Han, Yingjie and
L, Yx and
Li, Runzhi and
Wu, Pengcheng and
Zhang, Kunli",
editor = "Rambow, Owen and
Wanner, Leo and
Apidianaki, Marianna and
Al-Khalifa, Hend and
Eugenio, Barbara Di and
Schockaert, Steven",
booktitle = "Proceedings of the 31st International Conference on Computational Linguistics",
month = jan,
year = "2025",
address = "Abu Dhabi, UAE",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.coling-main.571/",
pages = "8546--8556",
abstract = "Automatic radiology report generation is pivotal in reducing the workload of radiologists, while simultaneously improving diagnostic accuracy and operational efficiency. Current methods face significant challenges, including the effective alignment of medical visual features with textual features and the mitigation of data bias. In this paper, we propose a method for radiology report generation that utilizes a Cross-modal Enhancement and Alignment Adapter (CmEAA) to connect a vision encoder with a frozen large language model. Specifically, we introduce two novel modules within CmEAA: Cross-modal Feature Enhancement (CFE) and Neural Mutual Information Aligner (NMIA). CFE extracts observation-related contextual features to enhance the visual features of lesions and abnormal regions in radiology images through a cross-modal enhancement transformer. NMIA maximizes neural mutual information between visual and textual representations within a low-dimensional alignment embedding space during training and provides potential global alignment visual representations during inference. Additionally, a weights generator is designed to enable the dynamic adaptation of cross-modal enhanced features and vanilla visual features. Experimental results on two prevailing datasets, namely, IU X-Ray and MIMIC-CXR, demonstrate that the proposed model outperforms previous state-of-the-art methods."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="huang-etal-2025-cmeaa">
<titleInfo>
<title>CmEAA: Cross-modal Enhancement and Alignment Adapter for Radiology Report Generation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Xiyang</namePart>
<namePart type="family">Huang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yingjie</namePart>
<namePart type="family">Han</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yx</namePart>
<namePart type="family">L</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Runzhi</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pengcheng</namePart>
<namePart type="family">Wu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kunli</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-01</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 31st International Conference on Computational Linguistics</title>
</titleInfo>
<name type="personal">
<namePart type="given">Owen</namePart>
<namePart type="family">Rambow</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Leo</namePart>
<namePart type="family">Wanner</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marianna</namePart>
<namePart type="family">Apidianaki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hend</namePart>
<namePart type="family">Al-Khalifa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Barbara</namePart>
<namePart type="given">Di</namePart>
<namePart type="family">Eugenio</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Steven</namePart>
<namePart type="family">Schockaert</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, UAE</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Automatic radiology report generation is pivotal in reducing the workload of radiologists, while simultaneously improving diagnostic accuracy and operational efficiency. Current methods face significant challenges, including the effective alignment of medical visual features with textual features and the mitigation of data bias. In this paper, we propose a method for radiology report generation that utilizes a Cross-modal Enhancement and Alignment Adapter (CmEAA) to connect a vision encoder with a frozen large language model. Specifically, we introduce two novel modules within CmEAA: Cross-modal Feature Enhancement (CFE) and Neural Mutual Information Aligner (NMIA). CFE extracts observation-related contextual features to enhance the visual features of lesions and abnormal regions in radiology images through a cross-modal enhancement transformer. NMIA maximizes neural mutual information between visual and textual representations within a low-dimensional alignment embedding space during training and provides potential global alignment visual representations during inference. Additionally, a weights generator is designed to enable the dynamic adaptation of cross-modal enhanced features and vanilla visual features. Experimental results on two prevailing datasets, namely, IU X-Ray and MIMIC-CXR, demonstrate that the proposed model outperforms previous state-of-the-art methods.</abstract>
<identifier type="citekey">huang-etal-2025-cmeaa</identifier>
<location>
<url>https://aclanthology.org/2025.coling-main.571/</url>
</location>
<part>
<date>2025-01</date>
<extent unit="page">
<start>8546</start>
<end>8556</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T CmEAA: Cross-modal Enhancement and Alignment Adapter for Radiology Report Generation
%A Huang, Xiyang
%A Han, Yingjie
%A L, Yx
%A Li, Runzhi
%A Wu, Pengcheng
%A Zhang, Kunli
%Y Rambow, Owen
%Y Wanner, Leo
%Y Apidianaki, Marianna
%Y Al-Khalifa, Hend
%Y Eugenio, Barbara Di
%Y Schockaert, Steven
%S Proceedings of the 31st International Conference on Computational Linguistics
%D 2025
%8 January
%I Association for Computational Linguistics
%C Abu Dhabi, UAE
%F huang-etal-2025-cmeaa
%X Automatic radiology report generation is pivotal in reducing the workload of radiologists, while simultaneously improving diagnostic accuracy and operational efficiency. Current methods face significant challenges, including the effective alignment of medical visual features with textual features and the mitigation of data bias. In this paper, we propose a method for radiology report generation that utilizes a Cross-modal Enhancement and Alignment Adapter (CmEAA) to connect a vision encoder with a frozen large language model. Specifically, we introduce two novel modules within CmEAA: Cross-modal Feature Enhancement (CFE) and Neural Mutual Information Aligner (NMIA). CFE extracts observation-related contextual features to enhance the visual features of lesions and abnormal regions in radiology images through a cross-modal enhancement transformer. NMIA maximizes neural mutual information between visual and textual representations within a low-dimensional alignment embedding space during training and provides potential global alignment visual representations during inference. Additionally, a weights generator is designed to enable the dynamic adaptation of cross-modal enhanced features and vanilla visual features. Experimental results on two prevailing datasets, namely, IU X-Ray and MIMIC-CXR, demonstrate that the proposed model outperforms previous state-of-the-art methods.
%U https://aclanthology.org/2025.coling-main.571/
%P 8546-8556
Markdown (Informal)
[CmEAA: Cross-modal Enhancement and Alignment Adapter for Radiology Report Generation](https://aclanthology.org/2025.coling-main.571/) (Huang et al., COLING 2025)
ACL