@inproceedings{won-etal-2026-trex,
    title     = "{TReX}: Tokenizer Regression for Optimal Data Mixture",
    author    = "Won, Inho and
      Yoo, Hangyeol and
      Cho, Minkyung and
      Park, Jungyeul and
      Song, Hoyun and
      Lim, KyungTae",
    editor    = "Demberg, Vera and
      Inui, Kentaro and
      Marquez, Llu{\'i}s",
    booktitle = "Proceedings of the 19th Conference of the {European} Chapter of the {Association} for {Computational} {Linguistics} (Volume 1: Long Papers)",
    month     = mar,
    year      = "2026",
    address   = "Rabat, Morocco",
    publisher = "Association for Computational Linguistics",
    url       = "https://aclanthology.org/2026.eacl-long.298/",
    pages     = "6353--6370",
    isbn      = "979-8-89176-380-7",
    abstract  = "Building effective tokenizers for multilingual Large Language Models (LLMs) requires careful control over language-specific data mixtures. While a tokenizer{'}s compression performance critically affects the efficiency of LLM training and inference, existing approaches rely on heuristics or costly large-scale searches to determine optimal language ratios. We introduce \textbf{T}okenizer \textbf{Re}gression for Optimal Data Mi\textbf{X}ture (TReX), a regression-based framework that efficiently predicts the optimal data mixture for tokenizer training. TReX trains small-scale proxy tokenizers on random mixtures, gathers their compression statistics, and learns to predict compression performance from data mixtures. This learned model enables scalable mixture search before large-scale tokenizer training, mitigating the accuracy-cost trade-off in multilingual tokenizer design. Tokenizers trained with TReX{'}s predicted mixtures outperform mixtures based on LLaMA3 and uniform distributions by up to 12{\%} in both in- and out-of-distribution compression efficiency, demonstrating strong scalability, robustness, and practical effectiveness.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="won-etal-2026-trex">
<titleInfo>
<title>TReX: Tokenizer Regression for Optimal Data Mixture</title>
</titleInfo>
<name type="personal">
<namePart type="given">Inho</namePart>
<namePart type="family">Won</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hangyeol</namePart>
<namePart type="family">Yoo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Minkyung</namePart>
<namePart type="family">Cho</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jungyeul</namePart>
<namePart type="family">Park</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hoyun</namePart>
<namePart type="family">Song</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">KyungTae</namePart>
<namePart type="family">Lim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 19th Conference of the European Chapter of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Vera</namePart>
<namePart type="family">Demberg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kentaro</namePart>
<namePart type="family">Inui</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lluís</namePart>
<namePart type="family">Marquez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Rabat, Morocco</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-380-7</identifier>
</relatedItem>
<abstract>Building effective tokenizers for multilingual Large Language Models (LLMs) requires careful control over language-specific data mixtures. While a tokenizer’s compression performance critically affects the efficiency of LLM training and inference, existing approaches rely on heuristics or costly large-scale searches to determine optimal language ratios. We introduce Tokenizer Regression for Optimal Data MiXture (TReX), a regression-based framework that efficiently predicts the optimal data mixture for tokenizer training. TReX trains small-scale proxy tokenizers on random mixtures, gathers their compression statistics, and learns to predict compression performance from data mixtures. This learned model enables scalable mixture search before large-scale tokenizer training, mitigating the accuracy-cost trade-off in multilingual tokenizer design. Tokenizers trained with TReX’s predicted mixtures outperform mixtures based on LLaMA3 and uniform distributions by up to 12% in both in- and out-of-distribution compression efficiency, demonstrating strong scalability, robustness, and practical effectiveness.</abstract>
<identifier type="citekey">won-etal-2026-trex</identifier>
<location>
<url>https://aclanthology.org/2026.eacl-long.298/</url>
</location>
<part>
<date>2026-03</date>
<extent unit="page">
<start>6353</start>
<end>6370</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T TReX: Tokenizer Regression for Optimal Data Mixture
%A Won, Inho
%A Yoo, Hangyeol
%A Cho, Minkyung
%A Park, Jungyeul
%A Song, Hoyun
%A Lim, KyungTae
%Y Demberg, Vera
%Y Inui, Kentaro
%Y Marquez, Lluís
%S Proceedings of the 19th Conference of the European Chapter of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2026
%8 March
%I Association for Computational Linguistics
%C Rabat, Morocco
%@ 979-8-89176-380-7
%F won-etal-2026-trex
%X Building effective tokenizers for multilingual Large Language Models (LLMs) requires careful control over language-specific data mixtures. While a tokenizer’s compression performance critically affects the efficiency of LLM training and inference, existing approaches rely on heuristics or costly large-scale searches to determine optimal language ratios. We introduce Tokenizer Regression for Optimal Data MiXture (TReX), a regression-based framework that efficiently predicts the optimal data mixture for tokenizer training. TReX trains small-scale proxy tokenizers on random mixtures, gathers their compression statistics, and learns to predict compression performance from data mixtures. This learned model enables scalable mixture search before large-scale tokenizer training, mitigating the accuracy-cost trade-off in multilingual tokenizer design. Tokenizers trained with TReX’s predicted mixtures outperform mixtures based on LLaMA3 and uniform distributions by up to 12% in both in- and out-of-distribution compression efficiency, demonstrating strong scalability, robustness, and practical effectiveness.
%U https://aclanthology.org/2026.eacl-long.298/
%P 6353-6370
Markdown (Informal)
[TReX: Tokenizer Regression for Optimal Data Mixture](https://aclanthology.org/2026.eacl-long.298/) (Won et al., EACL 2026)
ACL
- Inho Won, Hangyeol Yoo, Minkyung Cho, Jungyeul Park, Hoyun Song, and KyungTae Lim. 2026. TReX: Tokenizer Regression for Optimal Data Mixture. In Proceedings of the 19th Conference of the European Chapter of the Association for Computational Linguistics (Volume 1: Long Papers), pages 6353–6370, Rabat, Morocco. Association for Computational Linguistics.