% Journal article record: TACL volume 13 (2025), pages 831--847.
@article{kojima-etal-2025-continual,
    author    = {Kojima, Takeshi and
                 Matsuo, Yutaka and
                 Iwasawa, Yusuke},
    title     = {Continual Pre-training on Character-level Noisy Texts Makes Decoder-based Language Models Robust Few-shot Learners},
    journal   = {Transactions of the Association for Computational Linguistics},
    volume    = {13},
    pages     = {831--847},
    year      = {2025},
    publisher = {MIT Press},
    address   = {Cambridge, MA},
    doi       = {10.1162/tacl.a.21},
    url       = {https://aclanthology.org/2025.tacl-1.38/},
    abstract  = {Recent decoder-based pre-trained language models (PLMs) generally use subword tokenizers. However, adding character-level perturbations drastically changes the delimitation of texts by the tokenizers, leading to the vulnerability of PLMs. This study proposes a method of continual pre-training to convert decoder-based PLMs with subword tokenizers into perturbation-robust few-shot in-context learners. Our method continually trains decoder-based PLMs to predict the next tokens conditioning on artificially created character-level noisy texts. Since decoder-based language models are auto-regressive, we skip noised words from the target optimization. In addition, to maintain the same word prediction performance under noisy text as clean text, our method employs word distribution matching between the original PLMs and training models. We conducted experiments on various subword-based PLMs, including GPT2, Pythia, Mistral, Gemma2, and Llama3, ranging from 1B to 8B parameters. The results demonstrate that our method consistently improves the performance of few-shot in-context learning on downstream tasks which contain actual typos or misspellings as well as artificial noise.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="kojima-etal-2025-continual">
<titleInfo>
<title>Continual Pre-training on Character-level Noisy Texts Makes Decoder-based Language Models Robust Few-shot Learners</title>
</titleInfo>
<name type="personal">
<namePart type="given">Takeshi</namePart>
<namePart type="family">Kojima</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yutaka</namePart>
<namePart type="family">Matsuo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yusuke</namePart>
<namePart type="family">Iwasawa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<genre authority="bibutilsgt">journal article</genre>
<relatedItem type="host">
<titleInfo>
<title>Transactions of the Association for Computational Linguistics</title>
</titleInfo>
<originInfo>
<issuance>continuing</issuance>
<publisher>MIT Press</publisher>
<place>
<placeTerm type="text">Cambridge, MA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">periodical</genre>
<genre authority="bibutilsgt">academic journal</genre>
</relatedItem>
<abstract>Recent decoder-based pre-trained language models (PLMs) generally use subword tokenizers. However, adding character-level perturbations drastically changes the delimitation of texts by the tokenizers, leading to the vulnerability of PLMs. This study proposes a method of continual pre-training to convert decoder-based PLMs with subword tokenizers into perturbation-robust few-shot in-context learners. Our method continually trains decoder-based PLMs to predict the next tokens conditioning on artificially created character-level noisy texts. Since decoder-based language models are auto-regressive, we skip noised words from the target optimization. In addition, to maintain the same word prediction performance under noisy text as clean text, our method employs word distribution matching between the original PLMs and training models. We conducted experiments on various subword-based PLMs, including GPT2, Pythia, Mistral, Gemma2, and Llama3, ranging from 1B to 8B parameters. The results demonstrate that our method consistently improves the performance of few-shot in-context learning on downstream tasks which contain actual typos or misspellings as well as artificial noise.</abstract>
<identifier type="citekey">kojima-etal-2025-continual</identifier>
<identifier type="doi">10.1162/tacl.a.21</identifier>
<location>
<url>https://aclanthology.org/2025.tacl-1.38/</url>
</location>
<part>
<date>2025</date>
<detail type="volume"><number>13</number></detail>
<extent unit="page">
<start>831</start>
<end>847</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Journal Article
%T Continual Pre-training on Character-level Noisy Texts Makes Decoder-based Language Models Robust Few-shot Learners
%A Kojima, Takeshi
%A Matsuo, Yutaka
%A Iwasawa, Yusuke
%J Transactions of the Association for Computational Linguistics
%D 2025
%V 13
%I MIT Press
%C Cambridge, MA
%F kojima-etal-2025-continual
%X Recent decoder-based pre-trained language models (PLMs) generally use subword tokenizers. However, adding character-level perturbations drastically changes the delimitation of texts by the tokenizers, leading to the vulnerability of PLMs. This study proposes a method of continual pre-training to convert decoder-based PLMs with subword tokenizers into perturbation-robust few-shot in-context learners. Our method continually trains decoder-based PLMs to predict the next tokens conditioning on artificially created character-level noisy texts. Since decoder-based language models are auto-regressive, we skip noised words from the target optimization. In addition, to maintain the same word prediction performance under noisy text as clean text, our method employs word distribution matching between the original PLMs and training models. We conducted experiments on various subword-based PLMs, including GPT2, Pythia, Mistral, Gemma2, and Llama3, ranging from 1B to 8B parameters. The results demonstrate that our method consistently improves the performance of few-shot in-context learning on downstream tasks which contain actual typos or misspellings as well as artificial noise.
%R 10.1162/tacl.a.21
%U https://aclanthology.org/2025.tacl-1.38/
%U https://doi.org/10.1162/tacl.a.21
%P 831-847
Markdown (Informal)
[Continual Pre-training on Character-level Noisy Texts Makes Decoder-based Language Models Robust Few-shot Learners](https://aclanthology.org/2025.tacl-1.38/) (Kojima et al., TACL 2025)
ACL