@inproceedings{li-etal-2025-automir,
title = "{A}uto{MIR}: Effective Zero-Shot Medical Information Retrieval without Relevance Labels",
author = "Li, Lei and
Zhang, Xiangxu and
Zhou, Xiao and
Liu, Zheng",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2025",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-emnlp.1305/",
pages = "24028--24047",
ISBN = "979-8-89176-335-7",
abstract = "Medical information retrieval (MIR) is vital for accessing knowledge from electronic health records, scientific literature, and medical databases, supporting applications such as medical education, patient queries, and clinical diagnosis. However, effective zero-shot dense retrieval in the medical domain remains difficult due to the scarcity of relevance-labeled data. To address this challenge, we propose **S**elf-**L**earning **Hy**pothetical **D**ocument **E**mbeddings (**SL-HyDE**), a framework that leverages large language models (LLMs) to generate hypothetical documents conditioned on a query. These documents encapsulate essential medical context, guiding dense retrievers toward the most relevant results. SL-HyDE further employs a self-learning mechanism that iteratively improves pseudo-document generation and retrieval using unlabeled corpora, eliminating the need for labeled data. In addition, we introduce the Chinese Medical Information Retrieval Benchmark (CMIRB), a comprehensive evaluation suite reflecting real-world medical scenarios, comprising five tasks and ten datasets. By benchmarking ten models on CMIRB, we provide a rigorous standard for evaluating MIR systems. Experimental results demonstrate that SL-HyDE significantly outperforms HyDE in retrieval accuracy, while exhibiting strong generalization and scalability across diverse LLM and retriever configurations. Our code and data are publicly available at: https://github.com/ll0ruc/AutoMIR."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="li-etal-2025-automir">
<titleInfo>
<title>AutoMIR: Effective Zero-Shot Medical Information Retrieval without Relevance Labels</title>
</titleInfo>
<name type="personal">
<namePart type="given">Lei</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xiangxu</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xiao</namePart>
<namePart type="family">Zhou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zheng</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2025</title>
</titleInfo>
<name type="personal">
<namePart type="given">Christos</namePart>
<namePart type="family">Christodoulopoulos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tanmoy</namePart>
<namePart type="family">Chakraborty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Carolyn</namePart>
<namePart type="family">Rose</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Violet</namePart>
<namePart type="family">Peng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Suzhou, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-335-7</identifier>
</relatedItem>
<abstract>Medical information retrieval (MIR) is vital for accessing knowledge from electronic health records, scientific literature, and medical databases, supporting applications such as medical education, patient queries, and clinical diagnosis. However, effective zero-shot dense retrieval in the medical domain remains difficult due to the scarcity of relevance-labeled data. To address this challenge, we propose **S**elf-**L**earning **Hy**pothetical **D**ocument **E**mbeddings (**SL-HyDE**), a framework that leverages large language models (LLMs) to generate hypothetical documents conditioned on a query. These documents encapsulate essential medical context, guiding dense retrievers toward the most relevant results. SL-HyDE further employs a self-learning mechanism that iteratively improves pseudo-document generation and retrieval using unlabeled corpora, eliminating the need for labeled data. In addition, we introduce the Chinese Medical Information Retrieval Benchmark (CMIRB), a comprehensive evaluation suite reflecting real-world medical scenarios, comprising five tasks and ten datasets. By benchmarking ten models on CMIRB, we provide a rigorous standard for evaluating MIR systems. Experimental results demonstrate that SL-HyDE significantly outperforms HyDE in retrieval accuracy, while exhibiting strong generalization and scalability across diverse LLM and retriever configurations. Our code and data are publicly available at: https://github.com/ll0ruc/AutoMIR.</abstract>
<identifier type="citekey">li-etal-2025-automir</identifier>
<location>
<url>https://aclanthology.org/2025.findings-emnlp.1305/</url>
</location>
<part>
<date>2025-11</date>
<extent unit="page">
<start>24028</start>
<end>24047</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T AutoMIR: Effective Zero-Shot Medical Information Retrieval without Relevance Labels
%A Li, Lei
%A Zhang, Xiangxu
%A Zhou, Xiao
%A Liu, Zheng
%Y Christodoulopoulos, Christos
%Y Chakraborty, Tanmoy
%Y Rose, Carolyn
%Y Peng, Violet
%S Findings of the Association for Computational Linguistics: EMNLP 2025
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-335-7
%F li-etal-2025-automir
%X Medical information retrieval (MIR) is vital for accessing knowledge from electronic health records, scientific literature, and medical databases, supporting applications such as medical education, patient queries, and clinical diagnosis. However, effective zero-shot dense retrieval in the medical domain remains difficult due to the scarcity of relevance-labeled data. To address this challenge, we propose **S**elf-**L**earning **Hy**pothetical **D**ocument **E**mbeddings (**SL-HyDE**), a framework that leverages large language models (LLMs) to generate hypothetical documents conditioned on a query. These documents encapsulate essential medical context, guiding dense retrievers toward the most relevant results. SL-HyDE further employs a self-learning mechanism that iteratively improves pseudo-document generation and retrieval using unlabeled corpora, eliminating the need for labeled data. In addition, we introduce the Chinese Medical Information Retrieval Benchmark (CMIRB), a comprehensive evaluation suite reflecting real-world medical scenarios, comprising five tasks and ten datasets. By benchmarking ten models on CMIRB, we provide a rigorous standard for evaluating MIR systems. Experimental results demonstrate that SL-HyDE significantly outperforms HyDE in retrieval accuracy, while exhibiting strong generalization and scalability across diverse LLM and retriever configurations. Our code and data are publicly available at: https://github.com/ll0ruc/AutoMIR.
%U https://aclanthology.org/2025.findings-emnlp.1305/
%P 24028-24047
Markdown (Informal)
[AutoMIR: Effective Zero-Shot Medical Information Retrieval without Relevance Labels](https://aclanthology.org/2025.findings-emnlp.1305/) (Li et al., Findings 2025)
ACL