BibTeX
@inproceedings{yin-etal-2025-kia,
title = "{KIA}: Knowledge-Guided Implicit Vision-Language Alignment for Chest {X}-Ray Report Generation",
author = "Yin, Heng and
Zhou, Shanlin and
Wang, Pandong and
Wu, Zirui and
Hao, Yongtao",
editor = "Rambow, Owen and
Wanner, Leo and
Apidianaki, Marianna and
Al-Khalifa, Hend and
Eugenio, Barbara Di and
Schockaert, Steven",
booktitle = "Proceedings of the 31st International Conference on Computational Linguistics",
month = jan,
year = "2025",
address = "Abu Dhabi, UAE",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.coling-main.276/",
pages = "4096--4108",
abstract = "Report generation (RG) faces challenges in understanding complex medical images and establishing cross-modal semantic alignment in radiology image-report pairs. Previous methods often overlook fine-grained cross-modal interaction, leading to insufficient understanding of detailed information. Recently, various large multimodal models have been proposed for image-text tasks. However, such models still underperform on rare domain tasks like understanding complex medical images. To address these limitations, we develop a new framework of Knowledge-guided Implicit vision-language Alignment for radiology report generation, named KIA. To better understand medical reports and images and build alignment between them, multi-task implicit alignment is creatively introduced, forming comprehensive understanding of medical images and reports. Additionally, to further meet medical refinement requirements, we design novel masking strategies guided by medical knowledge to enhance pathological observation and anatomical landm"
}
MODS XML
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="yin-etal-2025-kia">
    <titleInfo>
      <title>KIA: Knowledge-Guided Implicit Vision-Language Alignment for Chest X-Ray Report Generation</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Heng</namePart>
      <namePart type="family">Yin</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Shanlin</namePart>
      <namePart type="family">Zhou</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Pandong</namePart>
      <namePart type="family">Wang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Zirui</namePart>
      <namePart type="family">Wu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Yongtao</namePart>
      <namePart type="family">Hao</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2025-01</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 31st International Conference on Computational Linguistics</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Owen</namePart>
        <namePart type="family">Rambow</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Leo</namePart>
        <namePart type="family">Wanner</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Marianna</namePart>
        <namePart type="family">Apidianaki</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Hend</namePart>
        <namePart type="family">Al-Khalifa</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Barbara</namePart>
        <namePart type="given">Di</namePart>
        <namePart type="family">Eugenio</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Steven</namePart>
        <namePart type="family">Schockaert</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Abu Dhabi, UAE</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Report generation (RG) faces challenges in understanding complex medical images and establishing cross-modal semantic alignment in radiology image-report pairs. Previous methods often overlook fine-grained cross-modal interaction, leading to insufficient understanding of detailed information. Recently, various large multimodal models have been proposed for image-text tasks. However, such models still underperform on rare domain tasks like understanding complex medical images. To address these limitations, we develop a new framework of Knowledge-guided Implicit vision-language Alignment for radiology report generation, named KIA. To better understand medical reports and images and build alignment between them, multi-task implicit alignment is creatively introduced, forming comprehensive understanding of medical images and reports. Additionally, to further meet medical refinement requirements, we design novel masking strategies guided by medical knowledge to enhance pathological observation and anatomical landm</abstract>
    <identifier type="citekey">yin-etal-2025-kia</identifier>
    <location>
      <url>https://aclanthology.org/2025.coling-main.276/</url>
    </location>
    <part>
      <date>2025-01</date>
      <extent unit="page">
        <start>4096</start>
        <end>4108</end>
      </extent>
    </part>
  </mods>
</modsCollection>
Endnote
%0 Conference Proceedings
%T KIA: Knowledge-Guided Implicit Vision-Language Alignment for Chest X-Ray Report Generation
%A Yin, Heng
%A Zhou, Shanlin
%A Wang, Pandong
%A Wu, Zirui
%A Hao, Yongtao
%Y Rambow, Owen
%Y Wanner, Leo
%Y Apidianaki, Marianna
%Y Al-Khalifa, Hend
%Y Eugenio, Barbara Di
%Y Schockaert, Steven
%S Proceedings of the 31st International Conference on Computational Linguistics
%D 2025
%8 January
%I Association for Computational Linguistics
%C Abu Dhabi, UAE
%F yin-etal-2025-kia
%X Report generation (RG) faces challenges in understanding complex medical images and establishing cross-modal semantic alignment in radiology image-report pairs. Previous methods often overlook fine-grained cross-modal interaction, leading to insufficient understanding of detailed information. Recently, various large multimodal models have been proposed for image-text tasks. However, such models still underperform on rare domain tasks like understanding complex medical images. To address these limitations, we develop a new framework of Knowledge-guided Implicit vision-language Alignment for radiology report generation, named KIA. To better understand medical reports and images and build alignment between them, multi-task implicit alignment is creatively introduced, forming comprehensive understanding of medical images and reports. Additionally, to further meet medical refinement requirements, we design novel masking strategies guided by medical knowledge to enhance pathological observation and anatomical landm
%U https://aclanthology.org/2025.coling-main.276/
%P 4096-4108
Markdown (Informal)
[KIA: Knowledge-Guided Implicit Vision-Language Alignment for Chest X-Ray Report Generation](https://aclanthology.org/2025.coling-main.276/) (Yin et al., COLING 2025)
ACL
Heng Yin, Shanlin Zhou, Pandong Wang, Zirui Wu, and Yongtao Hao. 2025. KIA: Knowledge-Guided Implicit Vision-Language Alignment for Chest X-Ray Report Generation. In Proceedings of the 31st International Conference on Computational Linguistics, pages 4096–4108, Abu Dhabi, UAE. Association for Computational Linguistics.