@inproceedings{lim-lauw-2025-interpreting,
title = "Interpreting Topic Models in Byte-Pair Encoding Space",
author = "Lim, Jia Peng and
Lauw, Hady",
editor = "Rambow, Owen and
Wanner, Leo and
Apidianaki, Marianna and
Al-Khalifa, Hend and
Di Eugenio, Barbara and
Schockaert, Steven",
booktitle = "Proceedings of the 31st International Conference on Computational Linguistics",
month = jan,
year = "2025",
address = "Abu Dhabi, UAE",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.coling-main.720/",
pages = "10810--10838",
abstract = "Byte-pair encoding (BPE) is pivotal for processing text into chunksize tokens, particularly in Large Language Model (LLM). From a topic modeling perspective, as these chunksize tokens might be mere parts of valid words, evaluating and interpreting these tokens for coherence is challenging. Most, if not all, of coherence evaluation measures are incompatible as they benchmark using valid words. We propose to interpret the recovery of valid words from these tokens as a ranking problem and present a model-agnostic and training-free recovery approach from the topic-token distribution onto a selected vocabulary space, following which we could apply existing evaluation measures. Results show that topic sets recovered from BPE vocabulary space are coherent."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="lim-lauw-2025-interpreting">
<titleInfo>
<title>Interpreting Topic Models in Byte-Pair Encoding Space</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jia</namePart>
<namePart type="given">Peng</namePart>
<namePart type="family">Lim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hady</namePart>
<namePart type="family">Lauw</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-01</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 31st International Conference on Computational Linguistics</title>
</titleInfo>
<name type="personal">
<namePart type="given">Owen</namePart>
<namePart type="family">Rambow</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Leo</namePart>
<namePart type="family">Wanner</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marianna</namePart>
<namePart type="family">Apidianaki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hend</namePart>
<namePart type="family">Al-Khalifa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Barbara</namePart>
<namePart type="family">Di Eugenio</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Steven</namePart>
<namePart type="family">Schockaert</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, UAE</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Byte-pair encoding (BPE) is pivotal for processing text into chunksize tokens, particularly in Large Language Model (LLM). From a topic modeling perspective, as these chunksize tokens might be mere parts of valid words, evaluating and interpreting these tokens for coherence is challenging. Most, if not all, of coherence evaluation measures are incompatible as they benchmark using valid words. We propose to interpret the recovery of valid words from these tokens as a ranking problem and present a model-agnostic and training-free recovery approach from the topic-token distribution onto a selected vocabulary space, following which we could apply existing evaluation measures. Results show that topic sets recovered from BPE vocabulary space are coherent.</abstract>
<identifier type="citekey">lim-lauw-2025-interpreting</identifier>
<location>
<url>https://aclanthology.org/2025.coling-main.720/</url>
</location>
<part>
<date>2025-01</date>
<extent unit="page">
<start>10810</start>
<end>10838</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Interpreting Topic Models in Byte-Pair Encoding Space
%A Lim, Jia Peng
%A Lauw, Hady
%Y Rambow, Owen
%Y Wanner, Leo
%Y Apidianaki, Marianna
%Y Al-Khalifa, Hend
%Y Di Eugenio, Barbara
%Y Schockaert, Steven
%S Proceedings of the 31st International Conference on Computational Linguistics
%D 2025
%8 January
%I Association for Computational Linguistics
%C Abu Dhabi, UAE
%F lim-lauw-2025-interpreting
%X Byte-pair encoding (BPE) is pivotal for processing text into chunksize tokens, particularly in Large Language Model (LLM). From a topic modeling perspective, as these chunksize tokens might be mere parts of valid words, evaluating and interpreting these tokens for coherence is challenging. Most, if not all, of coherence evaluation measures are incompatible as they benchmark using valid words. We propose to interpret the recovery of valid words from these tokens as a ranking problem and present a model-agnostic and training-free recovery approach from the topic-token distribution onto a selected vocabulary space, following which we could apply existing evaluation measures. Results show that topic sets recovered from BPE vocabulary space are coherent.
%U https://aclanthology.org/2025.coling-main.720/
%P 10810-10838
Markdown (Informal)
[Interpreting Topic Models in Byte-Pair Encoding Space](https://aclanthology.org/2025.coling-main.720/) (Lim & Lauw, COLING 2025)
ACL