@inproceedings{huang-wan-2025-triembed,
title = "{T}ri{E}mbed: Bridge the Gap between Text and Token Indices with Embedding Reparameterization",
author = "Huang, Baizhou and
Wan, Xiaojun",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2025",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-acl.275/",
doi = "10.18653/v1/2025.findings-acl.275",
pages = "5291--5297",
ISBN = "979-8-89176-256-5",
abstract = "The current paradigm of language modeling is a two-stage pipeline that first transforms raw text to token indices, where the distribution is then estimated. It inherently discards linguistic relations between tokens during tokenization, creating a fundamental gap. To address this, we propose \textbf{TriEmbed}, a reparameterization method for embeddings that incorporates the morphological relationships inherent in subword tokenizer algorithms. Specifically, by organizing the vocabulary into a Trie structure, we can encode these relations and reparametrize the embeddings, facilitating the recovery of other linguistic relationships during training. Empirical results across various settings demonstrate that TriEmbed outperforms conventional embeddings from the perspective of scaling, while offering more linguistically informative token embeddings."
}

<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="huang-wan-2025-triembed">
    <titleInfo>
      <title>TriEmbed: Bridge the Gap between Text and Token Indices with Embedding Reparameterization</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Baizhou</namePart>
      <namePart type="family">Huang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Xiaojun</namePart>
      <namePart type="family">Wan</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2025-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: ACL 2025</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Wanxiang</namePart>
        <namePart type="family">Che</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Joyce</namePart>
        <namePart type="family">Nabende</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Ekaterina</namePart>
        <namePart type="family">Shutova</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Mohammad</namePart>
        <namePart type="given">Taher</namePart>
        <namePart type="family">Pilehvar</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Vienna, Austria</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-256-5</identifier>
    </relatedItem>
    <abstract>The current paradigm of language modeling is a two-stage pipeline that first transforms raw text to token indices, where the distribution is then estimated. It inherently discards linguistic relations between tokens during tokenization, creating a fundamental gap. To address this, we propose TriEmbed, a reparameterization method for embeddings that incorporates the morphological relationships inherent in subword tokenizer algorithms. Specifically, by organizing the vocabulary into a Trie structure, we can encode these relations and reparametrize the embeddings, facilitating the recovery of other linguistic relationships during training. Empirical results across various settings demonstrate that TriEmbed outperforms conventional embeddings from the perspective of scaling, while offering more linguistically informative token embeddings.</abstract>
    <identifier type="citekey">huang-wan-2025-triembed</identifier>
    <identifier type="doi">10.18653/v1/2025.findings-acl.275</identifier>
    <location>
      <url>https://aclanthology.org/2025.findings-acl.275/</url>
    </location>
    <part>
      <date>2025-07</date>
      <extent unit="page">
        <start>5291</start>
        <end>5297</end>
      </extent>
    </part>
  </mods>
</modsCollection>

%0 Conference Proceedings
%T TriEmbed: Bridge the Gap between Text and Token Indices with Embedding Reparameterization
%A Huang, Baizhou
%A Wan, Xiaojun
%Y Che, Wanxiang
%Y Nabende, Joyce
%Y Shutova, Ekaterina
%Y Pilehvar, Mohammad Taher
%S Findings of the Association for Computational Linguistics: ACL 2025
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-256-5
%F huang-wan-2025-triembed
%X The current paradigm of language modeling is a two-stage pipeline that first transforms raw text to token indices, where the distribution is then estimated. It inherently discards linguistic relations between tokens during tokenization, creating a fundamental gap. To address this, we propose TriEmbed, a reparameterization method for embeddings that incorporates the morphological relationships inherent in subword tokenizer algorithms. Specifically, by organizing the vocabulary into a Trie structure, we can encode these relations and reparametrize the embeddings, facilitating the recovery of other linguistic relationships during training. Empirical results across various settings demonstrate that TriEmbed outperforms conventional embeddings from the perspective of scaling, while offering more linguistically informative token embeddings.
%R 10.18653/v1/2025.findings-acl.275
%U https://aclanthology.org/2025.findings-acl.275/
%U https://doi.org/10.18653/v1/2025.findings-acl.275
%P 5291-5297

Markdown (Informal)

[TriEmbed: Bridge the Gap between Text and Token Indices with Embedding Reparameterization](https://aclanthology.org/2025.findings-acl.275/) (Huang & Wan, Findings 2025)

ACL

Baizhou Huang and Xiaojun Wan. 2025. TriEmbed: Bridge the Gap between Text and Token Indices with Embedding Reparameterization. In Findings of the Association for Computational Linguistics: ACL 2025, pages 5291–5297, Vienna, Austria. Association for Computational Linguistics.
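The abstract describes the mechanism only at a high level: organize the vocabulary into a trie, then reparameterize the token embeddings through it so that morphological relations between subwords are encoded. Below is a minimal illustrative sketch in Python, assuming (hypothetically) that each token's embedding is the sum of learned vectors along its prefix path in a character-level trie over the vocabulary strings; the class name `TrieEmbedding` and this exact summation scheme are assumptions for illustration, and the paper's actual reparameterization may differ.

```python
import torch
import torch.nn as nn


class TrieEmbedding(nn.Module):
    """Toy trie-based embedding reparameterization (illustrative, not the paper's exact method).

    Each token's embedding is the sum of learned node vectors along its
    path in a character-level trie, so tokens that share prefixes
    (e.g. "un", "undo", "unit") share parameters.
    """

    def __init__(self, vocab, dim):
        super().__init__()
        node_of = {}     # prefix string -> trie node id
        self.paths = []  # per-token list of node ids on its trie path
        for token in vocab:
            path = []
            for i in range(1, len(token) + 1):
                prefix = token[:i]
                if prefix not in node_of:
                    node_of[prefix] = len(node_of)
                path.append(node_of[prefix])
            self.paths.append(path)
        self.node_emb = nn.Embedding(len(node_of), dim)

    def forward(self, token_ids):
        # Reparameterization: token embedding = sum of its trie-node vectors.
        out = []
        for t in token_ids.tolist():
            ids = torch.tensor(self.paths[t], device=token_ids.device)
            out.append(self.node_emb(ids).sum(dim=0))
        return torch.stack(out)


# Usage: "undo" and "unit" both reuse the node vectors for "u" and "un".
vocab = ["un", "undo", "unit", "do"]
emb = TrieEmbedding(vocab, dim=8)
print(emb(torch.tensor([1, 2])).shape)  # torch.Size([2, 8])
```

Because shared prefixes map to shared trie nodes, a gradient update to a common node vector moves every token containing that prefix together, which is one plausible reading of how such a reparameterization could facilitate "the recovery of other linguistic relationships during training."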