@inproceedings{oh-lee-2026-late,
title = "Late Code Chunking: A Code Chunking Strategy for Repository-Level Code Completion",
author = "Oh, Seungmin and
Lee, Eunseok",
editor = "Liakata, Maria and
Moreira, Viviane P. and
Zhang, Jiajun and
Jurgens, David",
booktitle = "Proceedings of the 64th Annual Meeting of the {A}ssociation for {C}omputational {L}inguistics (Volume 2: Short Papers)",
month = jul,
year = "2026",
address = "San Diego, California, United States",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.acl-short.64/",
pages = "780--786",
ISBN = "979-8-89176-391-3",
abstract = "This paper introduces Late Code Chunking (LC$^2$), a chunking strategy designed to improve the semantic understanding of code segments for Large Language Models (LLMs). Repository-level code completion requires predicting the completion of unfinished code by leveraging cross-file context spread across a repository. However, when retrieved fragments have missing semantics{---}the loss of structural or behavioral information during chunking{---}LLMs struggle to interpret the target code. To address this, LC$^2$ refines retrieved chunks by constructing a dual context: a ``Code Retrieval Context'' optimized for similarity-based search, and a ``Code Comprehension Context'' that serves as a late enrichment step through context expansion and augmentation. This dual-context design reduces information loss due to chunking and enhances the ability of LLMs to utilize retrieved code. Additionally, we introduce an Asymmetric Query-Chunk Sizing strategy to further optimize retrieval quality by minimizing query noise. Our experiments demonstrate that LC$^2$ provides robust performance gains, achieving a statistically significant 19.7{\%} improvement in Exact Match accuracy on the CrossCodeEval benchmark compared to the best existing chunking method."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="oh-lee-2026-late">
<titleInfo>
<title>Late Code Chunking: A Code Chunking Strategy for Repository-Level Code Completion</title>
</titleInfo>
<name type="personal">
<namePart type="given">Seungmin</namePart>
<namePart type="family">Oh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Eunseok</namePart>
<namePart type="family">Lee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Liakata</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Viviane</namePart>
<namePart type="given">P</namePart>
<namePart type="family">Moreira</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiajun</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Jurgens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-391-3</identifier>
</relatedItem>
<abstract>This paper introduces Late Code Chunking (LC²), a chunking strategy designed to improve the semantic understanding of code segments for Large Language Models (LLMs). Repository-level code completion requires predicting the completion of unfinished code by leveraging cross-file context spread across a repository. However, when retrieved fragments have missing semantics—the loss of structural or behavioral information during chunking—LLMs struggle to interpret the target code. To address this, LC² refines retrieved chunks by constructing a dual context: a “Code Retrieval Context” optimized for similarity-based search, and a “Code Comprehension Context” that serves as a late enrichment step through context expansion and augmentation. This dual-context design reduces information loss due to chunking and enhances the ability of LLMs to utilize retrieved code. Additionally, we introduce an Asymmetric Query-Chunk Sizing strategy to further optimize retrieval quality by minimizing query noise. Our experiments demonstrate that LC² provides robust performance gains, achieving a statistically significant 19.7% improvement in Exact Match accuracy on the CrossCodeEval benchmark compared to the best existing chunking method.</abstract>
<identifier type="citekey">oh-lee-2026-late</identifier>
<location>
<url>https://aclanthology.org/2026.acl-short.64/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>780</start>
<end>786</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Late Code Chunking: A Code Chunking Strategy for Repository-Level Code Completion
%A Oh, Seungmin
%A Lee, Eunseok
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-391-3
%F oh-lee-2026-late
%X This paper introduces Late Code Chunking (LC²), a chunking strategy designed to improve the semantic understanding of code segments for Large Language Models (LLMs). Repository-level code completion requires predicting the completion of unfinished code by leveraging cross-file context spread across a repository. However, when retrieved fragments have missing semantics—the loss of structural or behavioral information during chunking—LLMs struggle to interpret the target code. To address this, LC² refines retrieved chunks by constructing a dual context: a “Code Retrieval Context” optimized for similarity-based search, and a “Code Comprehension Context” that serves as a late enrichment step through context expansion and augmentation. This dual-context design reduces information loss due to chunking and enhances the ability of LLMs to utilize retrieved code. Additionally, we introduce an Asymmetric Query-Chunk Sizing strategy to further optimize retrieval quality by minimizing query noise. Our experiments demonstrate that LC² provides robust performance gains, achieving a statistically significant 19.7% improvement in Exact Match accuracy on the CrossCodeEval benchmark compared to the best existing chunking method.
%U https://aclanthology.org/2026.acl-short.64/
%P 780-786
Markdown (Informal)
[Late Code Chunking: A Code Chunking Strategy for Repository-Level Code Completion](https://aclanthology.org/2026.acl-short.64/) (Oh & Lee, ACL 2026)
ACL