@inproceedings{bolucu-can-2025-morpheme,
title = "A Morpheme-Aware Child-Inspired Language Model",
author = {B{\"o}l{\"u}c{\"u}, Necva and
Can, Burcu},
editor = "Charpentier, Lucas and
Choshen, Leshem and
Cotterell, Ryan and
Gul, Mustafa Omer and
Hu, Michael Y. and
Liu, Jing and
Jumelet, Jaap and
Linzen, Tal and
Mueller, Aaron and
Ross, Candace and
Shah, Raj Sanjay and
Warstadt, Alex and
Wilcox, Ethan Gotlieb and
Williams, Adina",
booktitle = "Proceedings of the First BabyLM Workshop",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.babylm-main.21/",
pages = "279--287",
ISBN = "TODO",
abstract = "Most tokenization methods in language models rely on subword units that lack explicit linguistic correspondence. In this work, we investigate the impact of using morpheme-based tokens in a small language model, comparing them to the widely used frequency-based method, BPE. We apply the morpheme-based tokenization method to both 10-million and 100-million word datasets from the BabyLM Challenge. Our results show that using a morphological tokenizer improves EWoK (basic world knowledge) performance by around 20{\%} and entity tracking by around 40{\%}, highlighting the impact of morphological information in developing smaller language models. We also apply curriculum learning, in which morphological information is gradually introduced during training, mirroring the vocabulary-building stage in infants that precedes morphological processing. The results are consistent with previous research: curriculum learning yields slight improvements for some tasks, but performance degradation in others."
}
[A Morpheme-Aware Child-Inspired Language Model](https://aclanthology.org/2025.babylm-main.21/) (Bölücü & Can, BabyLM 2025)
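The abstract contrasts morpheme-based tokenization with frequency-based BPE. As a rough illustration of that distinction (a hand-written sketch, not the authors' tokenizer or the BabyLM pipeline; all splits below are hypothetical examples), the Python snippet compares a BPE-style segmentation driven by corpus frequency with a morpheme-aligned segmentation whose pieces are prefixes, roots, and suffixes:

```python
# Hand-written illustration of the tokenization contrast described in the
# abstract; the splits below are made-up examples, not output from the
# paper's tokenizer or from a trained BPE model.

# Frequency-based (BPE-style) pieces are merged by corpus statistics,
# so they often cut across morpheme boundaries.
bpe_style = {
    "unhappiness": ["unh", "app", "iness"],
    "restructured": ["rest", "ruct", "ured"],
}

# Morpheme-based pieces align with prefixes, roots, and suffixes,
# giving the model tokens with explicit linguistic correspondence.
morpheme_based = {
    "unhappiness": ["un", "happi", "ness"],
    "restructured": ["re", "structur", "ed"],
}

for word in bpe_style:
    print(f"{word}:")
    print(f"  BPE-style : {' + '.join(bpe_style[word])}")
    print(f"  morphemic : {' + '.join(morpheme_based[word])}")
```

One plausible reading of the reported gains is that on small training budgets such as the BabyLM 10M- and 100M-word tracks, morpheme-aligned tokens recur across many word forms, letting the model reuse meaningful units rather than relearning frequency-driven fragments.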