@inproceedings{deiseroth-etal-2024-free,
title = "{T}-{FREE}: Subword Tokenizer-Free Generative {LLM}s via Sparse Representations for Memory-Efficient Embeddings",
author = {Deiseroth, Bj{\"o}rn and
Brack, Manuel and
Schramowski, Patrick and
Kersting, Kristian and
Weinbach, Samuel},
editor = "Al-Onaizan, Yaser and
Bansal, Mohit and
Chen, Yun-Nung",
booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.emnlp-main.1217",
pages = "21829--21851",
abstract = "Tokenizers are crucial for encoding information in Large Language Models, but their development has recently stagnated, and they contain inherent weaknesses. Major limitations include computational overhead, ineffective vocabulary use, and unnecessarily large embedding and head layers. Additionally, their performance is biased towards a reference corpus, leading to reduced effectiveness for underrepresented languages.To remedy these issues, we propose T-Free, which directly embeds words through sparse activation patterns over character triplets and does not require a reference corpus. T-Free inherently exploits morphological similarities and allows for strong compression of embedding layers. In our exhaustive experimental evaluation, we achieve competitive downstream performance with a parameter reduction of more than 85{\%} on these layers. Further, T-Free shows significant improvements in cross-lingual transfer learning.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="deiseroth-etal-2024-free">
<titleInfo>
<title>T-FREE: Subword Tokenizer-Free Generative LLMs via Sparse Representations for Memory-Efficient Embeddings</title>
</titleInfo>
<name type="personal">
<namePart type="given">Björn</namePart>
<namePart type="family">Deiseroth</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Manuel</namePart>
<namePart type="family">Brack</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Patrick</namePart>
<namePart type="family">Schramowski</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kristian</namePart>
<namePart type="family">Kersting</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Samuel</namePart>
<namePart type="family">Weinbach</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yaser</namePart>
<namePart type="family">Al-Onaizan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohit</namePart>
<namePart type="family">Bansal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yun-Nung</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, Florida, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Tokenizers are crucial for encoding information in Large Language Models, but their development has recently stagnated, and they contain inherent weaknesses. Major limitations include computational overhead, ineffective vocabulary use, and unnecessarily large embedding and head layers. Additionally, their performance is biased towards a reference corpus, leading to reduced effectiveness for underrepresented languages. To remedy these issues, we propose T-Free, which directly embeds words through sparse activation patterns over character triplets and does not require a reference corpus. T-Free inherently exploits morphological similarities and allows for strong compression of embedding layers. In our exhaustive experimental evaluation, we achieve competitive downstream performance with a parameter reduction of more than 85% on these layers. Further, T-Free shows significant improvements in cross-lingual transfer learning.</abstract>
<identifier type="citekey">deiseroth-etal-2024-free</identifier>
<location>
<url>https://aclanthology.org/2024.emnlp-main.1217</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>21829</start>
<end>21851</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T T-FREE: Subword Tokenizer-Free Generative LLMs via Sparse Representations for Memory-Efficient Embeddings
%A Deiseroth, Björn
%A Brack, Manuel
%A Schramowski, Patrick
%A Kersting, Kristian
%A Weinbach, Samuel
%Y Al-Onaizan, Yaser
%Y Bansal, Mohit
%Y Chen, Yun-Nung
%S Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F deiseroth-etal-2024-free
%X Tokenizers are crucial for encoding information in Large Language Models, but their development has recently stagnated, and they contain inherent weaknesses. Major limitations include computational overhead, ineffective vocabulary use, and unnecessarily large embedding and head layers. Additionally, their performance is biased towards a reference corpus, leading to reduced effectiveness for underrepresented languages. To remedy these issues, we propose T-Free, which directly embeds words through sparse activation patterns over character triplets and does not require a reference corpus. T-Free inherently exploits morphological similarities and allows for strong compression of embedding layers. In our exhaustive experimental evaluation, we achieve competitive downstream performance with a parameter reduction of more than 85% on these layers. Further, T-Free shows significant improvements in cross-lingual transfer learning.
%U https://aclanthology.org/2024.emnlp-main.1217
%P 21829-21851
Markdown (Informal)
[T-FREE: Subword Tokenizer-Free Generative LLMs via Sparse Representations for Memory-Efficient Embeddings](https://aclanthology.org/2024.emnlp-main.1217) (Deiseroth et al., EMNLP 2024)
ACL
Björn Deiseroth, Manuel Brack, Patrick Schramowski, Kristian Kersting, and Samuel Weinbach. 2024. T-FREE: Subword Tokenizer-Free Generative LLMs via Sparse Representations for Memory-Efficient Embeddings. In Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing, pages 21829–21851, Miami, Florida, USA. Association for Computational Linguistics.
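
The abstract describes embedding words via sparse activation patterns over character triplets rather than a subword vocabulary. The following is a minimal illustrative sketch of that idea, not the authors' implementation: the bucket count, embedding width, hashing scheme, and pooling choice are all assumptions made only to show how trigram hashing yields a vocabulary-free, overlap-preserving word embedding.

```python
# Illustrative sketch (NOT the T-FREE reference code) of embedding a word
# through a sparse activation pattern over its character trigrams.
# NUM_BUCKETS, EMBED_DIM, the SHA-256 hashing, and sum pooling are assumptions.
import hashlib
import numpy as np

NUM_BUCKETS = 8192   # assumed size of the trigram hash space (replaces a subword vocab)
EMBED_DIM = 64       # assumed embedding width

rng = np.random.default_rng(0)
trigram_embeddings = rng.normal(size=(NUM_BUCKETS, EMBED_DIM)).astype(np.float32)

def char_trigrams(word: str) -> list[str]:
    """Pad the word with spaces and slide a length-3 window over its characters."""
    padded = f" {word.lower()} "
    return [padded[i:i + 3] for i in range(len(padded) - 2)]

def trigram_buckets(word: str) -> list[int]:
    """Hash each trigram into a bucket; the set of buckets is the sparse activation pattern."""
    return [
        int(hashlib.sha256(tri.encode("utf-8")).hexdigest(), 16) % NUM_BUCKETS
        for tri in char_trigrams(word)
    ]

def embed_word(word: str) -> np.ndarray:
    """Pool (here: sum) the activated trigram embeddings into a single word embedding."""
    return trigram_embeddings[trigram_buckets(word)].sum(axis=0)

# Morphologically related words share trigrams, so their sparse patterns overlap,
# which is the property the abstract credits for exploiting morphological similarity.
shared = set(trigram_buckets("tokenize")) & set(trigram_buckets("tokenizer"))
print(f"shared trigram buckets: {len(shared)}")
print("embedding shape:", embed_word("tokenizer").shape)
```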