@comment{Conference-paper record for citation key chen-etal-2026-reasonembed.
  Case-sensitive words are protected as whole words so that styles which
  sentence-case titles keep their capitals. The abstract is stored as plain
  text without inline emphasis markup.}
@inproceedings{chen-etal-2026-reasonembed,
  title     = {{ReasonEmbed}: Enhanced Text Embeddings for Reasoning-Intensive Document Retrieval},
  author    = {Chen, Jianlyu and
               Lan, Junwei and
               Li, Chaofan and
               Lian, Defu and
               Liu, Zheng},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Proceedings of the 64th Annual Meeting of the {Association} for {Computational} {Linguistics} (Volume 1: Long Papers)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.acl-long.54/},
  pages     = {1203--1221},
  isbn      = {979-8-89176-390-6},
  abstract  = {In this paper, we introduce ReasonEmbed, a novel text embedding model developed for reasoning-intensive document retrieval. Our work includes three key technical contributions. First, we propose ReMixer, a new data synthesis method that overcomes the triviality problem prevalent in previous synthetic datasets, enabling large-scale production of 82K high-quality training samples. Second, we design Redapter, a self-adaptive learning algorithm that dynamically adjusts training each sample{'}s weight based on its reasoning intensity. This allows the model to effectively capture the complex semantic relationships between queries and documents. Third, we implement ReasonEmbed across multiple backbones of varying sizes, all of which achieve superior performance on reasoning-intensive retrieval tasks. Notably, our ReasonEmbed-Qwen3-8B model offers a record-high nDCG@10 score of 38.1 on the BRIGHT benchmark, which significantly outperforms existing text embedding models. We will fully open-source our created resource in ReasonEmbed to push forward the research advancement in this field.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- MODS record for the paper with citekey chen-etal-2026-reasonembed. -->
  <mods ID="chen-etal-2026-reasonembed">
    <titleInfo>
      <title>ReasonEmbed: Enhanced Text Embeddings for Reasoning-Intensive Document Retrieval</title>
    </titleInfo>

    <!-- Authors of the paper. -->
    <name type="personal">
      <namePart type="given">Jianlyu</namePart>
      <namePart type="family">Chen</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Junwei</namePart>
      <namePart type="family">Lan</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Chaofan</namePart>
      <namePart type="family">Li</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Defu</namePart>
      <namePart type="family">Lian</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Zheng</namePart>
      <namePart type="family">Liu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>

    <originInfo>
      <dateIssued>2026-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>

    <!-- Host item: the proceedings volume, with its editors, publisher, place and ISBN. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Maria</namePart>
        <namePart type="family">Liakata</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Viviane</namePart>
        <!-- Middle initial keeps its period, matching the other citation formats of this record. -->
        <namePart type="given">P.</namePart>
        <namePart type="family">Moreira</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jiajun</namePart>
        <namePart type="family">Zhang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">David</namePart>
        <namePart type="family">Jurgens</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">San Diego, California, United States</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-390-6</identifier>
    </relatedItem>

    <!-- Abstract stored as plain text, without inline emphasis markup. -->
    <abstract>In this paper, we introduce ReasonEmbed, a novel text embedding model developed for reasoning-intensive document retrieval. Our work includes three key technical contributions. First, we propose ReMixer, a new data synthesis method that overcomes the triviality problem prevalent in previous synthetic datasets, enabling large-scale production of 82K high-quality training samples. Second, we design Redapter, a self-adaptive learning algorithm that dynamically adjusts training each sample’s weight based on its reasoning intensity. This allows the model to effectively capture the complex semantic relationships between queries and documents. Third, we implement ReasonEmbed across multiple backbones of varying sizes, all of which achieve superior performance on reasoning-intensive retrieval tasks. Notably, our ReasonEmbed-Qwen3-8B model offers a record-high nDCG@10 score of 38.1 on the BRIGHT benchmark, which significantly outperforms existing text embedding models. We will fully open-source our created resource in ReasonEmbed to push forward the research advancement in this field.</abstract>
    <identifier type="citekey">chen-etal-2026-reasonembed</identifier>
    <location>
      <url>https://aclanthology.org/2026.acl-long.54/</url>
    </location>

    <!-- Date and page extent of the paper. -->
    <part>
      <date>2026-07</date>
      <extent unit="page">
        <start>1203</start>
        <end>1221</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T ReasonEmbed: Enhanced Text Embeddings for Reasoning-Intensive Document Retrieval
%A Chen, Jianlyu
%A Lan, Junwei
%A Li, Chaofan
%A Lian, Defu
%A Liu, Zheng
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-390-6
%F chen-etal-2026-reasonembed
%X In this paper, we introduce ReasonEmbed, a novel text embedding model developed for reasoning-intensive document retrieval. Our work includes three key technical contributions. First, we propose ReMixer, a new data synthesis method that overcomes the triviality problem prevalent in previous synthetic datasets, enabling large-scale production of 82K high-quality training samples. Second, we design Redapter, a self-adaptive learning algorithm that dynamically adjusts training each sample’s weight based on its reasoning intensity. This allows the model to effectively capture the complex semantic relationships between queries and documents. Third, we implement ReasonEmbed across multiple backbones of varying sizes, all of which achieve superior performance on reasoning-intensive retrieval tasks. Notably, our ReasonEmbed-Qwen3-8B model offers a record-high nDCG@10 score of 38.1 on the BRIGHT benchmark, which significantly outperforms existing text embedding models. We will fully open-source our created resource in ReasonEmbed to push forward the research advancement in this field.
%U https://aclanthology.org/2026.acl-long.54/
%P 1203-1221
Markdown (Informal)
[ReasonEmbed: Enhanced Text Embeddings for Reasoning-Intensive Document Retrieval](https://aclanthology.org/2026.acl-long.54/) (Chen et al., ACL 2026)
ACL