@inproceedings{yu-etal-2025-transfer,
title = "Transfer-Aware Data Selection for Domain Adaptation in Text Retrieval",
author = "Yu, Linzhu and
Li, Huan and
Chen, Ke and
Shou, Lidan",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2025",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-emnlp.948/",
pages = "17504--17519",
ISBN = "979-8-89176-335-7",
abstract = "Domain adaptation is widely adopted in text retrieval scenarios where large labeled data is unavailable. To improve model adaptability, existing methods try to expand more source datasets. However, we found from experiments that indiscriminately using a large amount of source data from various text tasks does not guarantee improved adaptability, but may negatively impact model performance. To tackle this issue, we propose Trait, a framework that can effectively improve model adaptability by selecting beneficial data without evaluating all source data. Specifically, we first divide multiple source datasets into data chunks of the same size as the minimum selection unit to form the whole selection space. Then we devise an iterative process that includes Bayesian optimization-based selection and transfer-aware chunk evaluation to incrementally select beneficial chunks. To reduce unnecessary evaluation costs, we also design backtracking and pruning actions to adjust the selection subspace. Extensive experimental results show that Trait not only achieves average state-of-the-art for few-shot on nine target datasets by evaluating only 4{\%} of BERRI source data, but also is very competitive for zero-shot compared with LLM-based rankers."
}

<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="yu-etal-2025-transfer">
<titleInfo>
<title>Transfer-Aware Data Selection for Domain Adaptation in Text Retrieval</title>
</titleInfo>
<name type="personal">
<namePart type="given">Linzhu</namePart>
<namePart type="family">Yu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Huan</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ke</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lidan</namePart>
<namePart type="family">Shou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2025</title>
</titleInfo>
<name type="personal">
<namePart type="given">Christos</namePart>
<namePart type="family">Christodoulopoulos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tanmoy</namePart>
<namePart type="family">Chakraborty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Carolyn</namePart>
<namePart type="family">Rose</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Violet</namePart>
<namePart type="family">Peng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Suzhou, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-335-7</identifier>
</relatedItem>
<abstract>Domain adaptation is widely adopted in text retrieval scenarios where large-scale labeled data is unavailable. To improve model adaptability, existing methods incorporate ever more source datasets. However, our experiments show that indiscriminately using large amounts of source data from diverse text tasks does not guarantee improved adaptability and may even degrade model performance. To tackle this issue, we propose Trait, a framework that effectively improves model adaptability by selecting beneficial data without evaluating all source data. Specifically, we first divide the source datasets into equal-sized data chunks, which serve as the minimum selection units and together form the full selection space. We then devise an iterative process that combines Bayesian optimization-based selection with transfer-aware chunk evaluation to incrementally select beneficial chunks. To reduce unnecessary evaluation costs, we also design backtracking and pruning actions that adjust the selection subspace. Extensive experiments show that Trait not only achieves state-of-the-art average few-shot performance on nine target datasets while evaluating only 4% of the BERRI source data, but also remains highly competitive with LLM-based rankers in the zero-shot setting.</abstract>
<identifier type="citekey">yu-etal-2025-transfer</identifier>
<location>
<url>https://aclanthology.org/2025.findings-emnlp.948/</url>
</location>
<part>
<date>2025-11</date>
<extent unit="page">
<start>17504</start>
<end>17519</end>
</extent>
</part>
</mods>
</modsCollection>

%0 Conference Proceedings
%T Transfer-Aware Data Selection for Domain Adaptation in Text Retrieval
%A Yu, Linzhu
%A Li, Huan
%A Chen, Ke
%A Shou, Lidan
%Y Christodoulopoulos, Christos
%Y Chakraborty, Tanmoy
%Y Rose, Carolyn
%Y Peng, Violet
%S Findings of the Association for Computational Linguistics: EMNLP 2025
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-335-7
%F yu-etal-2025-transfer
%X Domain adaptation is widely adopted in text retrieval scenarios where large-scale labeled data is unavailable. To improve model adaptability, existing methods incorporate ever more source datasets. However, our experiments show that indiscriminately using large amounts of source data from diverse text tasks does not guarantee improved adaptability and may even degrade model performance. To tackle this issue, we propose Trait, a framework that effectively improves model adaptability by selecting beneficial data without evaluating all source data. Specifically, we first divide the source datasets into equal-sized data chunks, which serve as the minimum selection units and together form the full selection space. We then devise an iterative process that combines Bayesian optimization-based selection with transfer-aware chunk evaluation to incrementally select beneficial chunks. To reduce unnecessary evaluation costs, we also design backtracking and pruning actions that adjust the selection subspace. Extensive experiments show that Trait not only achieves state-of-the-art average few-shot performance on nine target datasets while evaluating only 4% of the BERRI source data, but also remains highly competitive with LLM-based rankers in the zero-shot setting.
%U https://aclanthology.org/2025.findings-emnlp.948/
%P 17504-17519

Markdown (Informal)
[Transfer-Aware Data Selection for Domain Adaptation in Text Retrieval](https://aclanthology.org/2025.findings-emnlp.948/) (Yu et al., Findings 2025)
ACL
Linzhu Yu, Huan Li, Ke Chen, and Lidan Shou. 2025. Transfer-Aware Data Selection for Domain Adaptation in Text Retrieval. In Findings of the Association for Computational Linguistics: EMNLP 2025, pages 17504–17519, Suzhou, China. Association for Computational Linguistics.
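
The abstract outlines an iterative select-evaluate loop: equal-sized chunks form the selection space, a Bayesian-optimization-based step proposes the next chunk, a transfer-aware evaluation measures its effect, and backtracking and pruning actions adjust the selection subspace. The sketch below is a hypothetical, self-contained illustration of that loop's shape, not the authors' implementation: the chunk utilities, the `evaluate_transfer` stand-in, and the simplified acquisition function (a feature score plus a decaying exploration bonus, in place of a real Gaussian-process surrogate) are all assumptions.

```python
"""Hypothetical sketch of the chunk-selection loop described in the
abstract. NOT the authors' implementation: the chunk utilities, the
evaluate_transfer stand-in, and the toy acquisition function are
simulated assumptions used only to show the shape of the loop."""

import math
import random

random.seed(0)

# Assumed setup: the source pool is split into 100 equal-sized chunks,
# each with a hidden transfer utility (positive helps the target task).
NUM_CHUNKS = 100
hidden_utility = {c: random.gauss(0.0, 1.0) for c in range(NUM_CHUNKS)}
# Cheap observable features (e.g. similarity to the target data) that a
# surrogate could condition on; here, noisy copies of the utility.
feature = {c: hidden_utility[c] + random.gauss(0.0, 0.5)
           for c in range(NUM_CHUNKS)}


def evaluate_transfer(selected: set[int]) -> float:
    """Stand-in for transfer-aware evaluation. In the paper this would
    mean training on the selected chunks and scoring a target dev set;
    here it is simulated as a noisy sum of hidden utilities."""
    return sum(hidden_utility[c] for c in selected) + random.gauss(0.0, 0.2)


def acquisition(candidate: int, n_observed: int) -> float:
    """Toy acquisition score standing in for a real Bayesian-optimization
    surrogate: exploit the feature, with a decaying exploration bonus."""
    return feature[candidate] + random.random() / math.sqrt(1 + n_observed)


search_space = set(range(NUM_CHUNKS))  # the whole selection space
selected: set[int] = set()
observed: dict[int, float] = {}        # noisy marginal gain per chunk
best_score = evaluate_transfer(selected)

for step in range(30):
    if not search_space:
        break
    # 1) Selection: the surrogate proposes the most promising chunk.
    candidate = max(search_space, key=lambda c: acquisition(c, len(observed)))
    search_space.discard(candidate)

    # 2) Evaluation: measure the candidate's marginal effect.
    trial_score = evaluate_transfer(selected | {candidate})
    gain = trial_score - best_score
    observed[candidate] = gain
    if gain > 0:
        selected.add(candidate)
        best_score = trial_score
    elif gain < -0.5:
        # 3) Pruning: drop look-alikes of a clearly harmful chunk to
        #    shrink the selection subspace without evaluating them.
        search_space -= {c for c in search_space
                         if abs(feature[c] - feature[candidate]) < 0.1}

    # 4) Backtracking: retire the weakest selected chunk if the score
    #    improves without it (possible here since evaluation is noisy).
    if selected:
        weakest = min(selected, key=lambda c: observed[c])
        without = evaluate_transfer(selected - {weakest})
        if without > best_score:
            selected.discard(weakest)
            best_score = without

print(f"selected {len(selected)} of {NUM_CHUNKS} chunks, "
      f"score {best_score:.2f}")
```

In the framework the abstract describes, the evaluation step is presumably the expensive part (training a retriever on the selected chunks), which is why surrogate-guided selection, pruning, and backtracking matter: they let the method evaluate only a small fraction of the source pool, reportedly 4% of BERRI.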