@inproceedings{wang-etal-2025-document,
title = "Document Segmentation Matters for Retrieval-Augmented Generation",
author = "Wang, Zhitong and
Gao, Cheng and
Xiao, Chaojun and
Huang, Yufei and
Si, Shuzheng and
Luo, Kangyang and
Bai, Yuzhuo and
Li, Wenhao and
Duan, Tangjian and
Lv, Chuancheng and
Lu, Guoshan and
Chen, Gang and
Qi, Fanchao and
Sun, Maosong",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2025",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-acl.422/",
doi = "10.18653/v1/2025.findings-acl.422",
pages = "8063--8075",
ISBN = "979-8-89176-256-5",
abstract = "Retrieval-augmented generation (RAG) enhances large language models (LLMs) by integrating external knowledge. A critical yet underexplored challenge in RAG is document segmentation, also known as document chunking. Existing widely-used rule-based chunking methods usually lead to suboptimal splits, where overly large chunks introduce irrelevant information and small chunks lack semantic coherence. Existing semantic-based approaches either require costly LLM calls or fail to adaptively group contextually related sentences. To address these limitations, we propose PIC, Pseudo-Instruction for document Chunking), a simple yet effective method that leverages document summaries as pseudo-instructions to guide chunking. By computing semantic similarity between sentences and the summary, PIC dynamically groups sentences into chunks that align with the document{'}s key themes, ensuring semantic completeness and relevance to potential user instructions. Experiments on multiple open-domain question-answering benchmarks demonstrate that PIC can significantly improve retrieval accuracy (Hits@k) and end-to-end QA performance (Exact Match) without any additional training."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="wang-etal-2025-document">
<titleInfo>
<title>Document Segmentation Matters for Retrieval-Augmented Generation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Zhitong</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Cheng</namePart>
<namePart type="family">Gao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chaojun</namePart>
<namePart type="family">Xiao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yufei</namePart>
<namePart type="family">Huang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shuzheng</namePart>
<namePart type="family">Si</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kangyang</namePart>
<namePart type="family">Luo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yuzhuo</namePart>
<namePart type="family">Bai</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wenhao</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tangjian</namePart>
<namePart type="family">Duan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chuancheng</namePart>
<namePart type="family">Lv</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Guoshan</namePart>
<namePart type="family">Lu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gang</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Fanchao</namePart>
<namePart type="family">Qi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Maosong</namePart>
<namePart type="family">Sun</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2025</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wanxiang</namePart>
<namePart type="family">Che</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joyce</namePart>
<namePart type="family">Nabende</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Shutova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohammad</namePart>
<namePart type="given">Taher</namePart>
<namePart type="family">Pilehvar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-256-5</identifier>
</relatedItem>
<abstract>Retrieval-augmented generation (RAG) enhances large language models (LLMs) by integrating external knowledge. A critical yet underexplored challenge in RAG is document segmentation, also known as document chunking. Existing widely-used rule-based chunking methods usually lead to suboptimal splits, where overly large chunks introduce irrelevant information and small chunks lack semantic coherence. Existing semantic-based approaches either require costly LLM calls or fail to adaptively group contextually related sentences. To address these limitations, we propose PIC, Pseudo-Instruction for document Chunking), a simple yet effective method that leverages document summaries as pseudo-instructions to guide chunking. By computing semantic similarity between sentences and the summary, PIC dynamically groups sentences into chunks that align with the document’s key themes, ensuring semantic completeness and relevance to potential user instructions. Experiments on multiple open-domain question-answering benchmarks demonstrate that PIC can significantly improve retrieval accuracy (Hits@k) and end-to-end QA performance (Exact Match) without any additional training.</abstract>
<identifier type="citekey">wang-etal-2025-document</identifier>
<identifier type="doi">10.18653/v1/2025.findings-acl.422</identifier>
<location>
<url>https://aclanthology.org/2025.findings-acl.422/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>8063</start>
<end>8075</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Document Segmentation Matters for Retrieval-Augmented Generation
%A Wang, Zhitong
%A Gao, Cheng
%A Xiao, Chaojun
%A Huang, Yufei
%A Si, Shuzheng
%A Luo, Kangyang
%A Bai, Yuzhuo
%A Li, Wenhao
%A Duan, Tangjian
%A Lv, Chuancheng
%A Lu, Guoshan
%A Chen, Gang
%A Qi, Fanchao
%A Sun, Maosong
%Y Che, Wanxiang
%Y Nabende, Joyce
%Y Shutova, Ekaterina
%Y Pilehvar, Mohammad Taher
%S Findings of the Association for Computational Linguistics: ACL 2025
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-256-5
%F wang-etal-2025-document
%X Retrieval-augmented generation (RAG) enhances large language models (LLMs) by integrating external knowledge. A critical yet underexplored challenge in RAG is document segmentation, also known as document chunking. Existing widely-used rule-based chunking methods usually lead to suboptimal splits, where overly large chunks introduce irrelevant information and small chunks lack semantic coherence. Existing semantic-based approaches either require costly LLM calls or fail to adaptively group contextually related sentences. To address these limitations, we propose PIC, Pseudo-Instruction for document Chunking), a simple yet effective method that leverages document summaries as pseudo-instructions to guide chunking. By computing semantic similarity between sentences and the summary, PIC dynamically groups sentences into chunks that align with the document’s key themes, ensuring semantic completeness and relevance to potential user instructions. Experiments on multiple open-domain question-answering benchmarks demonstrate that PIC can significantly improve retrieval accuracy (Hits@k) and end-to-end QA performance (Exact Match) without any additional training.
%R 10.18653/v1/2025.findings-acl.422
%U https://aclanthology.org/2025.findings-acl.422/
%U https://doi.org/10.18653/v1/2025.findings-acl.422
%P 8063-8075
Markdown (Informal)
[Document Segmentation Matters for Retrieval-Augmented Generation](https://aclanthology.org/2025.findings-acl.422/) (Wang et al., Findings 2025)
ACL
- Zhitong Wang, Cheng Gao, Chaojun Xiao, Yufei Huang, Shuzheng Si, Kangyang Luo, Yuzhuo Bai, Wenhao Li, Tangjian Duan, Chuancheng Lv, Guoshan Lu, Gang Chen, Fanchao Qi, and Maosong Sun. 2025. Document Segmentation Matters for Retrieval-Augmented Generation. In Findings of the Association for Computational Linguistics: ACL 2025, pages 8063–8075, Vienna, Austria. Association for Computational Linguistics.