@inproceedings{takahashi-ishihara-2025-quantifying,
title = "Quantifying Memorization in Continual Pre-training with {J}apanese General or Industry-Specific Corpora",
author = "Takahashi, Hiromu and
Ishihara, Shotaro",
editor = "Jia, Robin and
Wallace, Eric and
Huang, Yangsibo and
Pimentel, Tiago and
Maini, Pratyush and
Dankers, Verna and
Wei, Johnny and
Lesci, Pietro",
booktitle = "Proceedings of the First Workshop on Large Language Model Memorization (L2M2)",
month = aug,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.l2m2-1.8/",
doi = "10.18653/v1/2025.l2m2-1.8",
pages = "95--105",
ISBN = "979-8-89176-278-7",
abstract = "Despite the growing concern about memorization of training data using large language models (LLMs), there has been insufficient analysis under conditions using non-English or industry-specific corpora.This study focuses on continual pre-training, a common approach in building non-English LLMs, and quantifies memorization of training data.Specifically, we trained two models based on Llama 3 using Japanese Wikipedia (general) and Japanese financial news articles (industry-specific).Experiments showed a tendency for the amount of memorization to increase as training progressed, similar to the empirical findings for English.This trend was clear in the industry-specific corpus, suggesting potential risks when using valuable, non-general industry corpora.We also identified issues specific to Japanese, and emphasized the importance of analysis other than in English."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="takahashi-ishihara-2025-quantifying">
<titleInfo>
<title>Quantifying Memorization in Continual Pre-training with Japanese General or Industry-Specific Corpora</title>
</titleInfo>
<name type="personal">
<namePart type="given">Hiromu</namePart>
<namePart type="family">Takahashi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shotaro</namePart>
<namePart type="family">Ishihara</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the First Workshop on Large Language Model Memorization (L2M2)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Robin</namePart>
<namePart type="family">Jia</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Eric</namePart>
<namePart type="family">Wallace</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yangsibo</namePart>
<namePart type="family">Huang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tiago</namePart>
<namePart type="family">Pimentel</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pratyush</namePart>
<namePart type="family">Maini</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Verna</namePart>
<namePart type="family">Dankers</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Johnny</namePart>
<namePart type="family">Wei</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pietro</namePart>
<namePart type="family">Lesci</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-278-7</identifier>
</relatedItem>
<abstract>Despite the growing concern about memorization of training data using large language models (LLMs), there has been insufficient analysis under conditions using non-English or industry-specific corpora. This study focuses on continual pre-training, a common approach in building non-English LLMs, and quantifies memorization of training data. Specifically, we trained two models based on Llama 3 using Japanese Wikipedia (general) and Japanese financial news articles (industry-specific). Experiments showed a tendency for the amount of memorization to increase as training progressed, similar to the empirical findings for English. This trend was clear in the industry-specific corpus, suggesting potential risks when using valuable, non-general industry corpora. We also identified issues specific to Japanese, and emphasized the importance of analysis other than in English.</abstract>
<identifier type="citekey">takahashi-ishihara-2025-quantifying</identifier>
<identifier type="doi">10.18653/v1/2025.l2m2-1.8</identifier>
<location>
<url>https://aclanthology.org/2025.l2m2-1.8/</url>
</location>
<part>
<date>2025-08</date>
<extent unit="page">
<start>95</start>
<end>105</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Quantifying Memorization in Continual Pre-training with Japanese General or Industry-Specific Corpora
%A Takahashi, Hiromu
%A Ishihara, Shotaro
%Y Jia, Robin
%Y Wallace, Eric
%Y Huang, Yangsibo
%Y Pimentel, Tiago
%Y Maini, Pratyush
%Y Dankers, Verna
%Y Wei, Johnny
%Y Lesci, Pietro
%S Proceedings of the First Workshop on Large Language Model Memorization (L2M2)
%D 2025
%8 August
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-278-7
%F takahashi-ishihara-2025-quantifying
%X Despite the growing concern about memorization of training data using large language models (LLMs), there has been insufficient analysis under conditions using non-English or industry-specific corpora. This study focuses on continual pre-training, a common approach in building non-English LLMs, and quantifies memorization of training data. Specifically, we trained two models based on Llama 3 using Japanese Wikipedia (general) and Japanese financial news articles (industry-specific). Experiments showed a tendency for the amount of memorization to increase as training progressed, similar to the empirical findings for English. This trend was clear in the industry-specific corpus, suggesting potential risks when using valuable, non-general industry corpora. We also identified issues specific to Japanese, and emphasized the importance of analysis other than in English.
%R 10.18653/v1/2025.l2m2-1.8
%U https://aclanthology.org/2025.l2m2-1.8/
%U https://doi.org/10.18653/v1/2025.l2m2-1.8
%P 95-105
Markdown (Informal)
[Quantifying Memorization in Continual Pre-training with Japanese General or Industry-Specific Corpora](https://aclanthology.org/2025.l2m2-1.8/) (Takahashi & Ishihara, L2M2 2025)
ACL
Hiromu Takahashi and Shotaro Ishihara. 2025. [Quantifying Memorization in Continual Pre-training with Japanese General or Industry-Specific Corpora](https://aclanthology.org/2025.l2m2-1.8/). In *Proceedings of the First Workshop on Large Language Model Memorization (L2M2)*, pages 95–105, Vienna, Austria. Association for Computational Linguistics.