% Conference paper record. The coined name "BabyLM" is brace-protected as a
% whole word so that sentence-casing bibliography styles keep its capitals.
@inproceedings{goriely-buttery-2025-babylms,
  author    = {Goriely, Zebulon and
               Buttery, Paula},
  title     = {{BabyLM}'s First Words: Word Segmentation as a Phonological Probing Task},
  editor    = {Boleda, Gemma and
               Roth, Michael},
  booktitle = {Proceedings of the 29th Conference on Computational Natural Language Learning},
  month     = jul,
  year      = {2025},
  address   = {Vienna, Austria},
  publisher = {Association for Computational Linguistics},
  pages     = {522--539},
  isbn      = {979-8-89176-271-8},
  doi       = {10.18653/v1/2025.conll-1.34},
  url       = {https://aclanthology.org/2025.conll-1.34/},
  abstract  = {Language models provide a key framework for studying linguistic theories based on prediction, but phonological analysis using large language models (LLMs) is difficult; there are few phonological benchmarks beyond English and the standard input representation used in LLMs (subwords of graphemes) is not suitable for analyzing the representation of phonemes. In this work, we demonstrate how word segmentation can be used as a phonological probing task, allowing us to study the representations learned by phoneme-based language models trained on child-directed speech across 31 languages. Following computational models of word segmentation, we present unsupervised methods for extracting word boundaries from a trained model using the observation that prediction-error peaks at the start of words. We also use linear probes to identify that these models implicitly track word boundaries, even when they do not appear in training. This cross-lingual work corroborates statistical learning theories of acquisition and empirically motivates new methods for training subword tokenizers.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 record for the paper; the proceedings volume it appears in is
     described by the nested relatedItem of type "host". -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="goriely-buttery-2025-babylms">
    <titleInfo>
      <title>BabyLM’s First Words: Word Segmentation as a Phonological Probing Task</title>
    </titleInfo>
    <!-- Paper authors -->
    <name type="personal">
      <namePart type="given">Zebulon</namePart>
      <namePart type="family">Goriely</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Paula</namePart>
      <namePart type="family">Buttery</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2025-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host publication: the proceedings volume, its editors and publisher -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 29th Conference on Computational Natural Language Learning</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Gemma</namePart>
        <namePart type="family">Boleda</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Michael</namePart>
        <namePart type="family">Roth</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Vienna, Austria</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-271-8</identifier>
    </relatedItem>
    <abstract>Language models provide a key framework for studying linguistic theories based on prediction, but phonological analysis using large language models (LLMs) is difficult; there are few phonological benchmarks beyond English and the standard input representation used in LLMs (subwords of graphemes) is not suitable for analyzing the representation of phonemes. In this work, we demonstrate how word segmentation can be used as a phonological probing task, allowing us to study the representations learned by phoneme-based language models trained on child-directed speech across 31 languages. Following computational models of word segmentation, we present unsupervised methods for extracting word boundaries from a trained model using the observation that prediction-error peaks at the start of words. We also use linear probes to identify that these models implicitly track word boundaries, even when they do not appear in training. This cross-lingual work corroborates statistical learning theories of acquisition and empirically motivates new methods for training subword tokenizers.</abstract>
    <!-- Identifiers and location of the paper itself -->
    <identifier type="citekey">goriely-buttery-2025-babylms</identifier>
    <identifier type="doi">10.18653/v1/2025.conll-1.34</identifier>
    <location>
      <url>https://aclanthology.org/2025.conll-1.34/</url>
    </location>
    <part>
      <date>2025-07</date>
      <extent unit="page">
        <start>522</start>
        <end>539</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T BabyLM’s First Words: Word Segmentation as a Phonological Probing Task
%A Goriely, Zebulon
%A Buttery, Paula
%Y Boleda, Gemma
%Y Roth, Michael
%S Proceedings of the 29th Conference on Computational Natural Language Learning
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-271-8
%F goriely-buttery-2025-babylms
%X Language models provide a key framework for studying linguistic theories based on prediction, but phonological analysis using large language models (LLMs) is difficult; there are few phonological benchmarks beyond English and the standard input representation used in LLMs (subwords of graphemes) is not suitable for analyzing the representation of phonemes. In this work, we demonstrate how word segmentation can be used as a phonological probing task, allowing us to study the representations learned by phoneme-based language models trained on child-directed speech across 31 languages. Following computational models of word segmentation, we present unsupervised methods for extracting word boundaries from a trained model using the observation that prediction-error peaks at the start of words. We also use linear probes to identify that these models implicitly track word boundaries, even when they do not appear in training. This cross-lingual work corroborates statistical learning theories of acquisition and empirically motivates new methods for training subword tokenizers.
%R 10.18653/v1/2025.conll-1.34
%U https://aclanthology.org/2025.conll-1.34/
%U https://doi.org/10.18653/v1/2025.conll-1.34
%P 522-539
Markdown (Informal)
[BabyLM’s First Words: Word Segmentation as a Phonological Probing Task](https://aclanthology.org/2025.conll-1.34/) (Goriely & Buttery, CoNLL 2025)
ACL