@inproceedings{zhang-etal-2025-casereportcollective,
    title = "{CaseReportCollective}: A Large-Scale {LLM}-Extracted Dataset for Structured Medical Case Reports",
    author = "Zhang, Xiao Yu Cindy and
      Fong, Melissa and
      Wasserman, Wyeth and
      Zhu, Jian",
    editor = "Demner-Fushman, Dina and
      Ananiadou, Sophia and
      Miwa, Makoto and
      Tsujii, Junichi",
    booktitle = "Proceedings of the 24th Workshop on Biomedical Language Processing",
    month = aug,
    year = "2025",
    address = "Vienna, Austria",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2025.bionlp-1.22/",
    doi = "10.18653/v1/2025.bionlp-1.22",
    pages = "249--262",
    isbn = "979-8-89176-275-6",
    abstract = "Case reports provide critical insights into rare and atypical diseases, but extracting structured knowledge remains challenging due to unstructured text and domain-specific terminology. We introduce CaseReportCollective, an LLM-extracted dataset of 85,961 open-access case reports spanning 37 years across 14 medical domains, validated through programmatic and human evaluation. Our dataset reveals key publication and demographic trends, including a significant increase in open-access case reports over the past decade, shifts in focus from oncology to COVID-19, and sex disparities in reporting across different medical conditions. Over time, the gap between male and female case reports has narrowed, suggesting greater equity in case reporting. Using CaseReportCollective, we further explore embedding-based retrieval for similar medical topics through accumulated similarity scores across extracted structured information. We also conducted detailed error analyses on the retrieval ranking, finding that high-reported topics dominate retrieval. Such retrieval is driven by lexical overlap rather than underlying clinical relevance, often failing to distinguish between semantically similar yet mechanistically distinct conditions. Future work should focus on clinical-aware embeddings adjusted for long-tailed case distributions to improve retrieval accuracy."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="zhang-etal-2025-casereportcollective">
<titleInfo>
<title>CaseReportCollective: A Large-Scale LLM-Extracted Dataset for Structured Medical Case Reports</title>
</titleInfo>
<name type="personal">
<namePart type="given">Xiao</namePart>
<namePart type="given">Yu</namePart>
<namePart type="given">Cindy</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Melissa</namePart>
<namePart type="family">Fong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wyeth</namePart>
<namePart type="family">Wasserman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jian</namePart>
<namePart type="family">Zhu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 24th Workshop on Biomedical Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Dina</namePart>
<namePart type="family">Demner-Fushman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sophia</namePart>
<namePart type="family">Ananiadou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Makoto</namePart>
<namePart type="family">Miwa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Junichi</namePart>
<namePart type="family">Tsujii</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-275-6</identifier>
</relatedItem>
<abstract>Case reports provide critical insights into rare and atypical diseases, but extracting structured knowledge remains challenging due to unstructured text and domain-specific terminology. We introduce CaseReportCollective, an LLM-extracted dataset of 85,961 open-access case reports spanning 37 years across 14 medical domains, validated through programmatic and human evaluation. Our dataset reveals key publication and demographic trends, including a significant increase in open-access case reports over the past decade, shifts in focus from oncology to COVID-19, and sex disparities in reporting across different medical conditions. Over time, the gap between male and female case reports has narrowed, suggesting greater equity in case reporting. Using CaseReportCollective, we further explore embedding-based retrieval for similar medical topics through accumulated similarity scores across extracted structured information. We also conducted detailed error analyses on the retrieval ranking, finding that high-reported topics dominate retrieval. Such retrieval is driven by lexical overlap rather than underlying clinical relevance, often failing to distinguish between semantically similar yet mechanistically distinct conditions. Future work should focus on clinical-aware embeddings adjusted for long-tailed case distributions to improve retrieval accuracy.</abstract>
<identifier type="citekey">zhang-etal-2025-casereportcollective</identifier>
<identifier type="doi">10.18653/v1/2025.bionlp-1.22</identifier>
<location>
<url>https://aclanthology.org/2025.bionlp-1.22/</url>
</location>
<part>
<date>2025-08</date>
<extent unit="page">
<start>249</start>
<end>262</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T CaseReportCollective: A Large-Scale LLM-Extracted Dataset for Structured Medical Case Reports
%A Zhang, Xiao Yu Cindy
%A Fong, Melissa
%A Wasserman, Wyeth
%A Zhu, Jian
%Y Demner-Fushman, Dina
%Y Ananiadou, Sophia
%Y Miwa, Makoto
%Y Tsujii, Junichi
%S Proceedings of the 24th Workshop on Biomedical Language Processing
%D 2025
%8 August
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-275-6
%F zhang-etal-2025-casereportcollective
%X Case reports provide critical insights into rare and atypical diseases, but extracting structured knowledge remains challenging due to unstructured text and domain-specific terminology. We introduce CaseReportCollective, an LLM-extracted dataset of 85,961 open-access case reports spanning 37 years across 14 medical domains, validated through programmatic and human evaluation. Our dataset reveals key publication and demographic trends, including a significant increase in open-access case reports over the past decade, shifts in focus from oncology to COVID-19, and sex disparities in reporting across different medical conditions. Over time, the gap between male and female case reports has narrowed, suggesting greater equity in case reporting. Using CaseReportCollective, we further explore embedding-based retrieval for similar medical topics through accumulated similarity scores across extracted structured information. We also conducted detailed error analyses on the retrieval ranking, finding that high-reported topics dominate retrieval. Such retrieval is driven by lexical overlap rather than underlying clinical relevance, often failing to distinguish between semantically similar yet mechanistically distinct conditions. Future work should focus on clinical-aware embeddings adjusted for long-tailed case distributions to improve retrieval accuracy.
%R 10.18653/v1/2025.bionlp-1.22
%U https://aclanthology.org/2025.bionlp-1.22/
%U https://doi.org/10.18653/v1/2025.bionlp-1.22
%P 249-262
Markdown (Informal)
[CaseReportCollective: A Large-Scale LLM-Extracted Dataset for Structured Medical Case Reports](https://aclanthology.org/2025.bionlp-1.22/) (Zhang et al., BioNLP 2025)
ACL