@inproceedings{wang-etal-2026-unigem,
title = "{U}ni{G}e{M}: Unifying Data Selection and Mixing via Geometric Exploration and Mining",
author = "Wang, Changhao and
Yunfeiyu and
Yao, Xinhao and
Yang, Jiaolong and
Yu, Lu and
Fang, Junpeng and
Li, Chaobo and
Cantoro, Riccardo and
Cui, Qing and
Zhou, Jun",
editor = "Liakata, Maria and
Moreira, Viviane P. and
Zhang, Jiajun and
Jurgens, David",
booktitle = "Findings of the {A}ssociation for {C}omputational {L}inguistics: {ACL} 2026",
month = jul,
year = "2026",
address = "San Diego, California, United States",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.findings-acl.1037/",
pages = "20696--20714",
ISBN = "979-8-89176-395-1",
abstract = "The scaling of Large Language Models (LLMs) is increasingly limited by data quality. Most methods handle data mixing and sample selection separately, which can break the structure in code corpora. We introduce UniGeM, a framework that unifies mixing and selection by treating data curation as a manifold approximation problem without training proxy models or relying on external reference datasets. UniGeM operates hierarchically: Macro-Exploration learns mixing weights with stability-based clustering; Micro-Mining filters high-quality instances by their geometric distribution to ensure logical consistency. Validated by training 8B and 16B MoE models on 100B tokens, UniGeM achieves 2.0 $\times$ data efficiency over a random baseline and further improves overall performance compared to SOTA methods in reasoning-heavy evaluations and multilingual generalization."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="wang-etal-2026-unigem">
<titleInfo>
<title>UniGeM: Unifying Data Selection and Mixing via Geometric Exploration and Mining</title>
</titleInfo>
<name type="personal">
<namePart type="given">Changhao</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name>
<namePart>Yunfeiyu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xinhao</namePart>
<namePart type="family">Yao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiaolong</namePart>
<namePart type="family">Yang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lu</namePart>
<namePart type="family">Yu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Junpeng</namePart>
<namePart type="family">Fang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chaobo</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Riccardo</namePart>
<namePart type="family">Cantoro</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Qing</namePart>
<namePart type="family">Cui</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jun</namePart>
<namePart type="family">Zhou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2026</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Liakata</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Viviane</namePart>
<namePart type="given">P</namePart>
<namePart type="family">Moreira</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiajun</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Jurgens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-395-1</identifier>
</relatedItem>
<abstract>The scaling of Large Language Models (LLMs) is increasingly limited by data quality. Most methods handle data mixing and sample selection separately, which can break the structure in code corpora. We introduce UniGeM, a framework that unifies mixing and selection by treating data curation as a manifold approximation problem without training proxy models or relying on external reference datasets. UniGeM operates hierarchically: Macro-Exploration learns mixing weights with stability-based clustering; Micro-Mining filters high-quality instances by their geometric distribution to ensure logical consistency. Validated by training 8B and 16B MoE models on 100B tokens, UniGeM achieves 2.0 \times data efficiency over a random baseline and further improves overall performance compared to SOTA methods in reasoning-heavy evaluations and multilingual generalization.</abstract>
<identifier type="citekey">wang-etal-2026-unigem</identifier>
<location>
<url>https://aclanthology.org/2026.findings-acl.1037/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>20696</start>
<end>20714</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T UniGeM: Unifying Data Selection and Mixing via Geometric Exploration and Mining
%A Wang, Changhao
%A Yao, Xinhao
%A Yang, Jiaolong
%A Yu, Lu
%A Fang, Junpeng
%A Li, Chaobo
%A Cantoro, Riccardo
%A Cui, Qing
%A Zhou, Jun
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%A Yunfeiyu
%S Findings of the Association for Computational Linguistics: ACL 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-395-1
%F wang-etal-2026-unigem
%X The scaling of Large Language Models (LLMs) is increasingly limited by data quality. Most methods handle data mixing and sample selection separately, which can break the structure in code corpora. We introduce UniGeM, a framework that unifies mixing and selection by treating data curation as a manifold approximation problem without training proxy models or relying on external reference datasets. UniGeM operates hierarchically: Macro-Exploration learns mixing weights with stability-based clustering; Micro-Mining filters high-quality instances by their geometric distribution to ensure logical consistency. Validated by training 8B and 16B MoE models on 100B tokens, UniGeM achieves 2.0 \times data efficiency over a random baseline and further improves overall performance compared to SOTA methods in reasoning-heavy evaluations and multilingual generalization.
%U https://aclanthology.org/2026.findings-acl.1037/
%P 20696-20714
Markdown (Informal)
[UniGeM: Unifying Data Selection and Mixing via Geometric Exploration and Mining](https://aclanthology.org/2026.findings-acl.1037/) (Wang et al., Findings 2026)
ACL
- Changhao Wang, Yunfeiyu, Xinhao Yao, Jiaolong Yang, Lu Yu, Junpeng Fang, Chaobo Li, Riccardo Cantoro, Qing Cui, and Jun Zhou. 2026. UniGeM: Unifying Data Selection and Mixing via Geometric Exploration and Mining. In Findings of the Association for Computational Linguistics: ACL 2026, pages 20696–20714, San Diego, California, United States. Association for Computational Linguistics.