% Conference paper record (NAACL 2025 System Demonstrations) for the Dataverse ETL pipeline paper.
% Proper nouns and acronyms in the title are brace-protected so sentence-casing styles keep them.
@inproceedings{park-etal-2025-dataverse,
    title     = {{Dataverse}: Open-Source {ETL} (Extract, Transform, Load) Pipeline for Large Language Models},
    author    = {Park, Hyunbyung and
                 Lee, Sukyung and
                 Gim, Gyoungjin and
                 Kim, Yungi and
                 Kim, Dahyun and
                 Park, Chanjun},
    editor    = {Dziri, Nouha and
                 Ren, Sean (Xiang) and
                 Diao, Shizhe},
    booktitle = {Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (System Demonstrations)},
    month     = apr,
    year      = {2025},
    address   = {Albuquerque, New Mexico},
    publisher = {Association for Computational Linguistics},
    url       = {https://aclanthology.org/2025.naacl-demo.1/},
    doi       = {10.18653/v1/2025.naacl-demo.1},
    pages     = {1--10},
    isbn      = {979-8-89176-191-9},
    abstract  = {To address the challenges associated with data processing at scale, we propose Dataverse, a unified open-source Extract-Transform-Load (ETL) pipeline for large language models (LLMs) with a user-friendly design at its core. Easy addition of custom processors with block-based interface in Dataverse allows users to readily and efficiently use Dataverse to build their own ETL pipeline. We hope that Dataverse will serve as a vital tool for LLM development and open source the entire library to welcome community contribution. Additionally, we provide a concise, two-minute video demonstration of our system, illustrating its capabilities and implementation.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="park-etal-2025-dataverse">
    <!-- Title of the paper -->
    <titleInfo>
      <title>Dataverse: Open-Source ETL (Extract, Transform, Load) Pipeline for Large Language Models</title>
    </titleInfo>
    <!-- Authors -->
    <name type="personal">
      <namePart type="given">Hyunbyung</namePart>
      <namePart type="family">Park</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Sukyung</namePart>
      <namePart type="family">Lee</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Gyoungjin</namePart>
      <namePart type="family">Gim</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Yungi</namePart>
      <namePart type="family">Kim</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Dahyun</namePart>
      <namePart type="family">Kim</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Chanjun</namePart>
      <namePart type="family">Park</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <!-- Issue date of the paper -->
    <originInfo>
      <dateIssued>2025-04</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, with its editors, publisher, place and ISBN -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (System Demonstrations)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Nouha</namePart>
        <namePart type="family">Dziri</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Sean</namePart>
        <namePart type="given">(Xiang)</namePart>
        <namePart type="family">Ren</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Shizhe</namePart>
        <namePart type="family">Diao</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Albuquerque, New Mexico</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-191-9</identifier>
    </relatedItem>
    <abstract>To address the challenges associated with data processing at scale, we propose Dataverse, a unified open-source Extract-Transform-Load (ETL) pipeline for large language models (LLMs) with a user-friendly design at its core. Easy addition of custom processors with block-based interface in Dataverse allows users to readily and efficiently use Dataverse to build their own ETL pipeline. We hope that Dataverse will serve as a vital tool for LLM development and open source the entire library to welcome community contribution. Additionally, we provide a concise, two-minute video demonstration of our system, illustrating its capabilities and implementation.</abstract>
    <!-- Identifiers and online location -->
    <identifier type="citekey">park-etal-2025-dataverse</identifier>
    <identifier type="doi">10.18653/v1/2025.naacl-demo.1</identifier>
    <location>
      <url>https://aclanthology.org/2025.naacl-demo.1/</url>
    </location>
    <!-- Page range within the host volume -->
    <part>
      <date>2025-04</date>
      <extent unit="page">
        <start>1</start>
        <end>10</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Dataverse: Open-Source ETL (Extract, Transform, Load) Pipeline for Large Language Models
%A Park, Hyunbyung
%A Lee, Sukyung
%A Gim, Gyoungjin
%A Kim, Yungi
%A Kim, Dahyun
%A Park, Chanjun
%Y Dziri, Nouha
%Y Ren, Sean (Xiang)
%Y Diao, Shizhe
%S Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (System Demonstrations)
%D 2025
%8 April
%I Association for Computational Linguistics
%C Albuquerque, New Mexico
%@ 979-8-89176-191-9
%F park-etal-2025-dataverse
%X To address the challenges associated with data processing at scale, we propose Dataverse, a unified open-source Extract-Transform-Load (ETL) pipeline for large language models (LLMs) with a user-friendly design at its core. Easy addition of custom processors with block-based interface in Dataverse allows users to readily and efficiently use Dataverse to build their own ETL pipeline. We hope that Dataverse will serve as a vital tool for LLM development and open source the entire library to welcome community contribution. Additionally, we provide a concise, two-minute video demonstration of our system, illustrating its capabilities and implementation.
%R 10.18653/v1/2025.naacl-demo.1
%U https://aclanthology.org/2025.naacl-demo.1/
%U https://doi.org/10.18653/v1/2025.naacl-demo.1
%P 1-10
Markdown (Informal)
[Dataverse: Open-Source ETL (Extract, Transform, Load) Pipeline for Large Language Models](https://aclanthology.org/2025.naacl-demo.1/) (Park et al., NAACL 2025)
ACL
- Hyunbyung Park, Sukyung Lee, Gyoungjin Gim, Yungi Kim, Dahyun Kim, and Chanjun Park. 2025. Dataverse: Open-Source ETL (Extract, Transform, Load) Pipeline for Large Language Models. In Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (System Demonstrations), pages 1–10, Albuquerque, New Mexico. Association for Computational Linguistics.