@inproceedings{ciaccio-etal-2025-beyond,
title = "Beyond the Spelling Miracle: Investigating Substring Awareness in Character-Blind Language Models",
author = "Ciaccio, Cristiano and
Sartor, Marta and
Miaschi, Alessio and
Dell{'}Orletta, Felice",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2025",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-acl.593/",
doi = "10.18653/v1/2025.findings-acl.593",
pages = "11361--11372",
ISBN = "979-8-89176-256-5",
abstract = "Correctly identifying characters and substrings of words should be a basic but essential ability of any Language Model that aims to proficiently understand and produce language. Despite so, the majority of Pre-trained Language Models (PLMs) are ``character-blind'' and struggle in spelling tasks, although they still seem to acquire some character knowledge during pre-training, a phenomenon dubbed Spelling Miracle. To shed light on this phenomenon, we systematically evaluate a range of PLMs with different parameter sizes using a controlled binary substring identification task. Through a series of experiments, we propose the first comprehensive investigation on where, when, and how a PLMs develop awareness of characters and substrings, with a particular linguistic focus on morphemic units such as prefixes, suffixes, and roots."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="ciaccio-etal-2025-beyond">
<titleInfo>
<title>Beyond the Spelling Miracle: Investigating Substring Awareness in Character-Blind Language Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Cristiano</namePart>
<namePart type="family">Ciaccio</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marta</namePart>
<namePart type="family">Sartor</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alessio</namePart>
<namePart type="family">Miaschi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Felice</namePart>
<namePart type="family">Dell’Orletta</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2025</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wanxiang</namePart>
<namePart type="family">Che</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joyce</namePart>
<namePart type="family">Nabende</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Shutova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohammad</namePart>
<namePart type="given">Taher</namePart>
<namePart type="family">Pilehvar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-256-5</identifier>
</relatedItem>
<abstract>Correctly identifying characters and substrings of words should be a basic but essential ability of any Language Model that aims to proficiently understand and produce language. Despite so, the majority of Pre-trained Language Models (PLMs) are “character-blind” and struggle in spelling tasks, although they still seem to acquire some character knowledge during pre-training, a phenomenon dubbed Spelling Miracle. To shed light on this phenomenon, we systematically evaluate a range of PLMs with different parameter sizes using a controlled binary substring identification task. Through a series of experiments, we propose the first comprehensive investigation on where, when, and how a PLMs develop awareness of characters and substrings, with a particular linguistic focus on morphemic units such as prefixes, suffixes, and roots.</abstract>
<identifier type="citekey">ciaccio-etal-2025-beyond</identifier>
<identifier type="doi">10.18653/v1/2025.findings-acl.593</identifier>
<location>
<url>https://aclanthology.org/2025.findings-acl.593/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>11361</start>
<end>11372</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Beyond the Spelling Miracle: Investigating Substring Awareness in Character-Blind Language Models
%A Ciaccio, Cristiano
%A Sartor, Marta
%A Miaschi, Alessio
%A Dell’Orletta, Felice
%Y Che, Wanxiang
%Y Nabende, Joyce
%Y Shutova, Ekaterina
%Y Pilehvar, Mohammad Taher
%S Findings of the Association for Computational Linguistics: ACL 2025
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-256-5
%F ciaccio-etal-2025-beyond
%X Correctly identifying characters and substrings of words should be a basic but essential ability of any Language Model that aims to proficiently understand and produce language. Despite this, the majority of Pre-trained Language Models (PLMs) are “character-blind” and struggle with spelling tasks, although they still seem to acquire some character knowledge during pre-training, a phenomenon dubbed the Spelling Miracle. To shed light on this phenomenon, we systematically evaluate a range of PLMs with different parameter sizes using a controlled binary substring identification task. Through a series of experiments, we present the first comprehensive investigation of where, when, and how PLMs develop awareness of characters and substrings, with a particular linguistic focus on morphemic units such as prefixes, suffixes, and roots.
%R 10.18653/v1/2025.findings-acl.593
%U https://aclanthology.org/2025.findings-acl.593/
%U https://doi.org/10.18653/v1/2025.findings-acl.593
%P 11361-11372
Markdown (Informal)
[Beyond the Spelling Miracle: Investigating Substring Awareness in Character-Blind Language Models](https://aclanthology.org/2025.findings-acl.593/) (Ciaccio et al., Findings 2025)
ACL
Cristiano Ciaccio, Marta Sartor, Alessio Miaschi, and Felice Dell’Orletta. 2025. Beyond the Spelling Miracle: Investigating Substring Awareness in Character-Blind Language Models. In Findings of the Association for Computational Linguistics: ACL 2025, pages 11361–11372, Vienna, Austria. Association for Computational Linguistics.