@inproceedings{gu-etal-2025-overlapping,
title = "Overlapping Context with Variable-Length Stride Increases Diversity when Training Large Language Model for Code",
author = "Gu, Geonmo and
Kwak, Jaeho and
Moon, Haksoo and
Shim, Hyun Seung and
Kim, Yu Jin and
Kim, Byoungjip and
Lee, Moontae and
Jeon, Hyejeong",
editor = "Rehm, Georg and
Li, Yunyao",
booktitle = "Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 6: Industry Track)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.acl-industry.32/",
doi = "10.18653/v1/2025.acl-industry.32",
pages = "456--468",
ISBN = "979-8-89176-288-6",
abstract = "The pretraining of code LLMs typically begins with general data and progresses to domain-specific data through sequential stages. In the latter stages, a challenging issue is that the data of a target domain can be limited in size, and the conventional approach of increasing the number of epochs does not lead to a performance gain. In this paper, we propose a novel packing method, which is extracting overlapping contexts from the training data using variable-length stride. Our method can mitigate the data-scarcity issue by providing more diverse and abundant examples of next token prediction than non-overlapping contexts. While the training time of our approach is increased proportionally to the amount of augmented examples, we present space-efficient implementations to store overlapping contexts. Extensive experiments with real datasets show that our approach outperforms the conventional approach of controlling the number of epochs in terms of the pass@$k$ rate."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="gu-etal-2025-overlapping">
<titleInfo>
<title>Overlapping Context with Variable-Length Stride Increases Diversity when Training Large Language Model for Code</title>
</titleInfo>
<name type="personal">
<namePart type="given">Geonmo</namePart>
<namePart type="family">Gu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jaeho</namePart>
<namePart type="family">Kwak</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Haksoo</namePart>
<namePart type="family">Moon</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hyun</namePart>
<namePart type="given">Seung</namePart>
<namePart type="family">Shim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yu</namePart>
<namePart type="given">Jin</namePart>
<namePart type="family">Kim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Byoungjip</namePart>
<namePart type="family">Kim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Moontae</namePart>
<namePart type="family">Lee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hyejeong</namePart>
<namePart type="family">Jeon</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 6: Industry Track)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Georg</namePart>
<namePart type="family">Rehm</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yunyao</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-288-6</identifier>
</relatedItem>
<abstract>The pretraining of code LLMs typically begins with general data and progresses to domain-specific data through sequential stages. In the latter stages, a challenging issue is that the data of a target domain can be limited in size, and the conventional approach of increasing the number of epochs does not lead to a performance gain. In this paper, we propose a novel packing method, which is extracting overlapping contexts from the training data using variable-length stride. Our method can mitigate the data-scarcity issue by providing more diverse and abundant examples of next token prediction than non-overlapping contexts. While the training time of our approach is increased proportionally to the amount of augmented examples, we present space-efficient implementations to store overlapping contexts. Extensive experiments with real datasets show that our approach outperforms the conventional approach of controlling the number of epochs in terms of the pass@k rate.</abstract>
<identifier type="citekey">gu-etal-2025-overlapping</identifier>
<identifier type="doi">10.18653/v1/2025.acl-industry.32</identifier>
<location>
<url>https://aclanthology.org/2025.acl-industry.32/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>456</start>
<end>468</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Overlapping Context with Variable-Length Stride Increases Diversity when Training Large Language Model for Code
%A Gu, Geonmo
%A Kwak, Jaeho
%A Moon, Haksoo
%A Shim, Hyun Seung
%A Kim, Yu Jin
%A Kim, Byoungjip
%A Lee, Moontae
%A Jeon, Hyejeong
%Y Rehm, Georg
%Y Li, Yunyao
%S Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 6: Industry Track)
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-288-6
%F gu-etal-2025-overlapping
%X The pretraining of code LLMs typically begins with general data and progresses to domain-specific data through sequential stages. In the latter stages, a challenging issue is that the data of a target domain can be limited in size, and the conventional approach of increasing the number of epochs does not lead to a performance gain. In this paper, we propose a novel packing method, which is extracting overlapping contexts from the training data using variable-length stride. Our method can mitigate the data-scarcity issue by providing more diverse and abundant examples of next token prediction than non-overlapping contexts. While the training time of our approach is increased proportionally to the amount of augmented examples, we present space-efficient implementations to store overlapping contexts. Extensive experiments with real datasets show that our approach outperforms the conventional approach of controlling the number of epochs in terms of the pass@k rate.
%R 10.18653/v1/2025.acl-industry.32
%U https://aclanthology.org/2025.acl-industry.32/
%U https://doi.org/10.18653/v1/2025.acl-industry.32
%P 456-468
Markdown (Informal)
[Overlapping Context with Variable-Length Stride Increases Diversity when Training Large Language Model for Code](https://aclanthology.org/2025.acl-industry.32/) (Gu et al., ACL 2025)
ACL
- Geonmo Gu, Jaeho Kwak, Haksoo Moon, Hyun Seung Shim, Yu Jin Kim, Byoungjip Kim, Moontae Lee, and Hyejeong Jeon. 2025. Overlapping Context with Variable-Length Stride Increases Diversity when Training Large Language Model for Code. In Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 6: Industry Track), pages 456–468, Vienna, Austria. Association for Computational Linguistics.