@article{min-etal-2025-rethinking,
title = "Rethinking Data Use in Large Language Models",
author = "Min, Sewon and
Hajishirzi, Hannaneh and
Zettlemoyer, Luke",
journal = "Computational Linguistics",
volume = "51",
number = "4",
month = dec,
year = "2025",
address = "Cambridge, MA",
publisher = "MIT Press",
url = "https://aclanthology.org/2025.cl-4.1/",
doi = "10.1162/coli.a.573",
pages = "1033--1118",
abstract = "Large language models (LMs) such as ChatGPT have revolutionized natural language processing and artificial intelligence more broadly. In this work, we discuss our research on understanding and advancing these models, centered around how they use the very large text corpora they are trained on. First, we describe our efforts to understand how these models learn to perform new tasks after training, demonstrating that their so-called in-context learning capabilities are almost entirely determined by what they learn from the training data. Next, we introduce a new class of LMs{---}nonparametric LMs{---}that repurpose this training data as a data store from which they retrieve information for improved accuracy and updatability. We discuss our work establishing the foundations of such models, including one of the first broadly used neural retrieval models and an approach that simplifies a traditional, two-stage pipeline into one. We also discuss how nonparametric models open up new avenues for responsible data use, e.g., by segregating permissive and copyrighted text and using them differently. Finally, we envision the next generation of LMs we should build, focusing on efficient scaling, improved factuality, and decentralization."
}

<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="min-etal-2025-rethinking">
<titleInfo>
<title>Rethinking Data Use in Large Language Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Sewon</namePart>
<namePart type="family">Min</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hannaneh</namePart>
<namePart type="family">Hajishirzi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Luke</namePart>
<namePart type="family">Zettlemoyer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<genre authority="bibutilsgt">journal article</genre>
<relatedItem type="host">
<titleInfo>
<title>Computational Linguistics</title>
</titleInfo>
<originInfo>
<issuance>continuing</issuance>
<publisher>MIT Press</publisher>
<place>
<placeTerm type="text">Cambridge, MA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">periodical</genre>
<genre authority="bibutilsgt">academic journal</genre>
</relatedItem>
<abstract>Large language models (LMs) such as ChatGPT have revolutionized natural language processing and artificial intelligence more broadly. In this work, we discuss our research on understanding and advancing these models, centered around how they use the very large text corpora they are trained on. First, we describe our efforts to understand how these models learn to perform new tasks after training, demonstrating that their so-called in-context learning capabilities are almost entirely determined by what they learn from the training data. Next, we introduce a new class of LMs—nonparametric LMs—that repurpose this training data as a data store from which they retrieve information for improved accuracy and updatability. We discuss our work establishing the foundations of such models, including one of the first broadly used neural retrieval models and an approach that simplifies a traditional, two-stage pipeline into one. We also discuss how nonparametric models open up new avenues for responsible data use, e.g., by segregating permissive and copyrighted text and using them differently. Finally, we envision the next generation of LMs we should build, focusing on efficient scaling, improved factuality, and decentralization.</abstract>
<identifier type="citekey">min-etal-2025-rethinking</identifier>
<identifier type="doi">10.1162/coli.a.573</identifier>
<location>
<url>https://aclanthology.org/2025.cl-4.1/</url>
</location>
<part>
<date>2025-12</date>
<detail type="volume"><number>51</number></detail>
<detail type="issue"><number>4</number></detail>
<extent unit="page">
<start>1033</start>
<end>1118</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Journal Article
%T Rethinking Data Use in Large Language Models
%A Min, Sewon
%A Hajishirzi, Hannaneh
%A Zettlemoyer, Luke
%J Computational Linguistics
%D 2025
%8 December
%V 51
%N 4
%I MIT Press
%C Cambridge, MA
%F min-etal-2025-rethinking
%X Large language models (LMs) such as ChatGPT have revolutionized natural language processing and artificial intelligence more broadly. In this work, we discuss our research on understanding and advancing these models, centered around how they use the very large text corpora they are trained on. First, we describe our efforts to understand how these models learn to perform new tasks after training, demonstrating that their so-called in-context learning capabilities are almost entirely determined by what they learn from the training data. Next, we introduce a new class of LMs—nonparametric LMs—that repurpose this training data as a data store from which they retrieve information for improved accuracy and updatability. We discuss our work establishing the foundations of such models, including one of the first broadly used neural retrieval models and an approach that simplifies a traditional, two-stage pipeline into one. We also discuss how nonparametric models open up new avenues for responsible data use, e.g., by segregating permissive and copyrighted text and using them differently. Finally, we envision the next generation of LMs we should build, focusing on efficient scaling, improved factuality, and decentralization.
%R 10.1162/coli.a.573
%U https://aclanthology.org/2025.cl-4.1/
%U https://doi.org/10.1162/coli.a.573
%P 1033-1118
Markdown (Informal)
[Rethinking Data Use in Large Language Models](https://aclanthology.org/2025.cl-4.1/) (Min et al., CL 2025)
ACL