BibTeX
@inproceedings{khan-etal-2025-tokensmith,
    title = "{T}oken{S}mith: Streamlining Data Editing, Search, and Inspection for Large-Scale Language Model Training and Interpretability",
    author = "Khan, Mohammad Aflah and
      Godbole, Ameya and
      Wei, Johnny and
      Wang, Ryan Yixiang and
      Flemings, James and
      Gummadi, Krishna P. and
      Neiswanger, Willie and
      Jia, Robin",
    editor = {Habernal, Ivan and
      Schulam, Peter and
      Tiedemann, J{\"o}rg},
    booktitle = "Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing: System Demonstrations",
    month = nov,
    year = "2025",
    address = "Suzhou, China",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2025.emnlp-demos.50/",
    doi = "10.18653/v1/2025.emnlp-demos.50",
    pages = "678--687",
    ISBN = "979-8-89176-334-0",
    abstract = "Understanding the relationship between training data and model behavior during pretraining is crucial, but existing workflows make this process cumbersome, fragmented, and often inaccessible to researchers. We present TokenSmith, an open-source library for interactive editing, inspection, and analysis of datasets used in Megatron-style pretraining frameworks such as GPT-NeoX, Megatron, and NVIDIA NeMo. TokenSmith supports a wide range of operations including searching, viewing, exporting, inspecting, and sampling data, all accessible through a simple user interface and a modular backend. It also enables structured editing of pretraining data without requiring changes to training code, simplifying dataset debugging, validation, and experimentation. TokenSmith is designed as a plug-and-play addition to existing large language model pretraining workflows, thereby democratizing access to production-grade dataset tooling. TokenSmith is hosted on GitHub (https://github.com/aflah02/TokenSmith), with accompanying documentation and tutorials (https://aflah02.github.io/TokenSmith/). A demonstration video is also available on YouTube (https://www.youtube.com/watch?v=cDO8VE9fZvU)."
}

MODS XML
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="khan-etal-2025-tokensmith">
    <titleInfo>
      <title>TokenSmith: Streamlining Data Editing, Search, and Inspection for Large-Scale Language Model Training and Interpretability</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Mohammad</namePart>
      <namePart type="given">Aflah</namePart>
      <namePart type="family">Khan</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Ameya</namePart>
      <namePart type="family">Godbole</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Johnny</namePart>
      <namePart type="family">Wei</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Ryan</namePart>
      <namePart type="given">Yixiang</namePart>
      <namePart type="family">Wang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">James</namePart>
      <namePart type="family">Flemings</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Krishna</namePart>
      <namePart type="given">P</namePart>
      <namePart type="family">Gummadi</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Willie</namePart>
      <namePart type="family">Neiswanger</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Robin</namePart>
      <namePart type="family">Jia</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2025-11</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing: System Demonstrations</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Ivan</namePart>
        <namePart type="family">Habernal</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Peter</namePart>
        <namePart type="family">Schulam</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jörg</namePart>
        <namePart type="family">Tiedemann</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Suzhou, China</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-334-0</identifier>
    </relatedItem>
    <abstract>Understanding the relationship between training data and model behavior during pretraining is crucial, but existing workflows make this process cumbersome, fragmented, and often inaccessible to researchers. We present TokenSmith, an open-source library for interactive editing, inspection, and analysis of datasets used in Megatron-style pretraining frameworks such as GPT-NeoX, Megatron, and NVIDIA NeMo. TokenSmith supports a wide range of operations including searching, viewing, exporting, inspecting, and sampling data, all accessible through a simple user interface and a modular backend. It also enables structured editing of pretraining data without requiring changes to training code, simplifying dataset debugging, validation, and experimentation. TokenSmith is designed as a plug-and-play addition to existing large language model pretraining workflows, thereby democratizing access to production-grade dataset tooling. TokenSmith is hosted on GitHub (https://github.com/aflah02/TokenSmith), with accompanying documentation and tutorials (https://aflah02.github.io/TokenSmith/). A demonstration video is also available on YouTube (https://www.youtube.com/watch?v=cDO8VE9fZvU).</abstract>
    <identifier type="citekey">khan-etal-2025-tokensmith</identifier>
    <identifier type="doi">10.18653/v1/2025.emnlp-demos.50</identifier>
    <location>
      <url>https://aclanthology.org/2025.emnlp-demos.50/</url>
    </location>
    <part>
      <date>2025-11</date>
      <extent unit="page">
        <start>678</start>
        <end>687</end>
      </extent>
    </part>
  </mods>
</modsCollection>

Endnote
%0 Conference Proceedings
%T TokenSmith: Streamlining Data Editing, Search, and Inspection for Large-Scale Language Model Training and Interpretability
%A Khan, Mohammad Aflah
%A Godbole, Ameya
%A Wei, Johnny
%A Wang, Ryan Yixiang
%A Flemings, James
%A Gummadi, Krishna P.
%A Neiswanger, Willie
%A Jia, Robin
%Y Habernal, Ivan
%Y Schulam, Peter
%Y Tiedemann, Jörg
%S Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing: System Demonstrations
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-334-0
%F khan-etal-2025-tokensmith
%X Understanding the relationship between training data and model behavior during pretraining is crucial, but existing workflows make this process cumbersome, fragmented, and often inaccessible to researchers. We present TokenSmith, an open-source library for interactive editing, inspection, and analysis of datasets used in Megatron-style pretraining frameworks such as GPT-NeoX, Megatron, and NVIDIA NeMo. TokenSmith supports a wide range of operations including searching, viewing, exporting, inspecting, and sampling data, all accessible through a simple user interface and a modular backend. It also enables structured editing of pretraining data without requiring changes to training code, simplifying dataset debugging, validation, and experimentation. TokenSmith is designed as a plug-and-play addition to existing large language model pretraining workflows, thereby democratizing access to production-grade dataset tooling. TokenSmith is hosted on GitHub (https://github.com/aflah02/TokenSmith), with accompanying documentation and tutorials (https://aflah02.github.io/TokenSmith/). A demonstration video is also available on YouTube (https://www.youtube.com/watch?v=cDO8VE9fZvU).
%R 10.18653/v1/2025.emnlp-demos.50
%U https://aclanthology.org/2025.emnlp-demos.50/
%U https://doi.org/10.18653/v1/2025.emnlp-demos.50
%P 678-687
Markdown (Informal)
[TokenSmith: Streamlining Data Editing, Search, and Inspection for Large-Scale Language Model Training and Interpretability](https://aclanthology.org/2025.emnlp-demos.50/) (Khan et al., EMNLP 2025)
ACL
Mohammad Aflah Khan, Ameya Godbole, Johnny Wei, Ryan Yixiang Wang, James Flemings, Krishna P. Gummadi, Willie Neiswanger, and Robin Jia. 2025. TokenSmith: Streamlining Data Editing, Search, and Inspection for Large-Scale Language Model Training and Interpretability. In Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing: System Demonstrations, pages 678–687, Suzhou, China. Association for Computational Linguistics.