% Conference paper in the Findings of ACL 2026 volume (pages 13122--13133).
% Case-sensitive tokens in title/booktitle are protected with whole-word braces
% so that sentence-casing styles keep them intact without mid-word brace groups.
@inproceedings{xu-etal-2026-128k,
  author    = {Xu, Chejian and
               Ping, Wei and
               Xu, Peng and
               Liu, Zihan and
               Wang, Boxin and
               Shoeybi, Mohammad and
               Li, Bo and
               Catanzaro, Bryan},
  title     = {From {128K} to {4M}: Efficient Training of Ultra-Long Context Large Language Models},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Findings of the {Association} for {Computational} {Linguistics}: {ACL} 2026},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  pages     = {13122--13133},
  isbn      = {979-8-89176-395-1},
  url       = {https://aclanthology.org/2026.findings-acl.640/},
  abstract  = {Long-context capabilities are essential for a wide range of applications, including document and video understanding, in-context learning, and inference-time scaling, all of which require models to process and reason over long sequences of text and multimodal data. In this work, we introduce an efficient training recipe for building ultra-long context LLMs from aligned instruct model, pushing the boundaries of context lengths from 128K to 1M, 2M, and 4M tokens. Our approach leverages continued pretraining strategies to extend the context window, while employing efficient instruction tuning to maintain short context capabilities. Our UltraLong-8B, built on Llama-3.1-Instruct, achieves state-of-the-art performance across a diverse set of long-context benchmarks. Importantly, UltraLong-8B also maintains competitive performance on standard benchmarks, showing balanced improvements for both long and short context tasks. We provide an in-depth analysis of key design choices, highlighting the impacts of scaling strategies and data composition. Our findings establish a robust framework for efficiently scaling context lengths while preserving general model capabilities. We released all model weights for open research.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- Single MODS record; its ID matches the citekey identifier further down. -->
  <mods ID="xu-etal-2026-128k">
    <titleInfo>
      <title>From 128K to 4M: Efficient Training of Ultra-Long Context Large Language Models</title>
    </titleInfo>

    <!-- Authors of the paper, in byline order (eight entries). -->
    <name type="personal">
      <namePart type="given">Chejian</namePart>
      <namePart type="family">Xu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Wei</namePart>
      <namePart type="family">Ping</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Peng</namePart>
      <namePart type="family">Xu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Zihan</namePart>
      <namePart type="family">Liu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Boxin</namePart>
      <namePart type="family">Wang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Mohammad</namePart>
      <namePart type="family">Shoeybi</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Bo</namePart>
      <namePart type="family">Li</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Bryan</namePart>
      <namePart type="family">Catanzaro</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>

    <originInfo>
      <dateIssued>2026-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>

    <!-- Host item: the proceedings volume, with its editors, publisher, place and ISBN. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: ACL 2026</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Maria</namePart>
        <namePart type="family">Liakata</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Viviane</namePart>
        <namePart type="given">P</namePart>
        <namePart type="family">Moreira</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jiajun</namePart>
        <namePart type="family">Zhang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">David</namePart>
        <namePart type="family">Jurgens</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">San Diego, California, United States</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-395-1</identifier>
    </relatedItem>

    <abstract>Long-context capabilities are essential for a wide range of applications, including document and video understanding, in-context learning, and inference-time scaling, all of which require models to process and reason over long sequences of text and multimodal data. In this work, we introduce an efficient training recipe for building ultra-long context LLMs from aligned instruct model, pushing the boundaries of context lengths from 128K to 1M, 2M, and 4M tokens. Our approach leverages continued pretraining strategies to extend the context window, while employing efficient instruction tuning to maintain short context capabilities. Our UltraLong-8B, built on Llama-3.1-Instruct, achieves state-of-the-art performance across a diverse set of long-context benchmarks. Importantly, UltraLong-8B also maintains competitive performance on standard benchmarks, showing balanced improvements for both long and short context tasks. We provide an in-depth analysis of key design choices, highlighting the impacts of scaling strategies and data composition. Our findings establish a robust framework for efficiently scaling context lengths while preserving general model capabilities. We released all model weights for open research.</abstract>
    <identifier type="citekey">xu-etal-2026-128k</identifier>
    <location>
      <url>https://aclanthology.org/2026.findings-acl.640/</url>
    </location>

    <!-- Position of the paper inside the host volume: issue date and page extent. -->
    <part>
      <date>2026-07</date>
      <extent unit="page">
        <start>13122</start>
        <end>13133</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T From 128K to 4M: Efficient Training of Ultra-Long Context Large Language Models
%A Xu, Chejian
%A Ping, Wei
%A Xu, Peng
%A Liu, Zihan
%A Wang, Boxin
%A Shoeybi, Mohammad
%A Li, Bo
%A Catanzaro, Bryan
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Findings of the Association for Computational Linguistics: ACL 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-395-1
%F xu-etal-2026-128k
%X Long-context capabilities are essential for a wide range of applications, including document and video understanding, in-context learning, and inference-time scaling, all of which require models to process and reason over long sequences of text and multimodal data. In this work, we introduce an efficient training recipe for building ultra-long context LLMs from aligned instruct model, pushing the boundaries of context lengths from 128K to 1M, 2M, and 4M tokens. Our approach leverages continued pretraining strategies to extend the context window, while employing efficient instruction tuning to maintain short context capabilities. Our UltraLong-8B, built on Llama-3.1-Instruct, achieves state-of-the-art performance across a diverse set of long-context benchmarks. Importantly, UltraLong-8B also maintains competitive performance on standard benchmarks, showing balanced improvements for both long and short context tasks. We provide an in-depth analysis of key design choices, highlighting the impacts of scaling strategies and data composition. Our findings establish a robust framework for efficiently scaling context lengths while preserving general model capabilities. We released all model weights for open research.
%U https://aclanthology.org/2026.findings-acl.640/
%P 13122-13133
Markdown (Informal)
[From 128K to 4M: Efficient Training of Ultra-Long Context Large Language Models](https://aclanthology.org/2026.findings-acl.640/) (Xu et al., Findings 2026)
ACL
- Chejian Xu, Wei Ping, Peng Xu, Zihan Liu, Boxin Wang, Mohammad Shoeybi, Bo Li, and Bryan Catanzaro. 2026. From 128K to 4M: Efficient Training of Ultra-Long Context Large Language Models. In Findings of the Association for Computational Linguistics: ACL 2026, pages 13122–13133, San Diego, California, United States. Association for Computational Linguistics.