@inproceedings{nikolaev-etal-2025-case,
title = "Case{--}Number Dissociation in {F}innish Noun Embeddings: fast{T}ext vs. {BERT} Layer Effects",
author = "Nikolaev, Alexandre and
Chuang, Yu-Ying and
Baayen, R. Harald",
editor = {H{\"a}m{\"a}l{\"a}inen, Mika and
Rie{\ss}ler, Michael and
Morooka, Eiaki V. and
Kharlashkin, Lev},
booktitle = "Proceedings of the 10th International Workshop on Computational Linguistics for Uralic Languages",
month = dec,
year = "2025",
address = "Joensuu, Finland",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.iwclul-1.16/",
pages = "127--130",
ISBN = "979-8-89176-360-9",
abstract = "Motivated by how inflectional morphology is encoded in modern embeddings, we revisit the 55,271 inflected forms from the 2,000 most frequent Finnish nouns analyzed by Nikolaev et al. (2022) using fastText and ask a single question: where does inflectional morphology emerge in BERT? For each form, we extract minimal-context FinBERT vectors from every layer (1{--}12) by running each word in isolation and averaging its WordPiece vectors into a single representation. Using the same generating model as in Nikolaev et al. (2022), we impute latent vectors for the stem, Number, Case, Possessive, and Clitic, plus a higher-order interaction, and evaluate by rank-1 nearest correlation. Within BERT, accuracy follows an emergence curve from 67.21{\%} (layer 1) to 86.16{\%} (layer 12). The error mix shifts with depth: middle layers show a lower share of Case errors but a higher share of Number errors, whereas the top layer reverses this tendency; clitic-only errors are rare throughout. For context, the fastText ceiling is slightly higher ({\ensuremath{\approx}}89{\%}), but our focus is the layer-resolved profile inside BERT. The result is a compact, reproducible map of Finnish noun inflection across the BERT stack, showing how different inflectional cues become recoverable at different depths (BERT layers) under an identical modeling and evaluation pipeline."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="nikolaev-etal-2025-case">
<titleInfo>
<title>Case–Number Dissociation in Finnish Noun Embeddings: fastText vs. BERT Layer Effects</title>
</titleInfo>
<name type="personal">
<namePart type="given">Alexandre</namePart>
<namePart type="family">Nikolaev</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yu-Ying</namePart>
<namePart type="family">Chuang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">R</namePart>
<namePart type="given">Harald</namePart>
<namePart type="family">Baayen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 10th International Workshop on Computational Linguistics for Uralic Languages</title>
</titleInfo>
<name type="personal">
<namePart type="given">Mika</namePart>
<namePart type="family">Hämäläinen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Michael</namePart>
<namePart type="family">Rießler</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Eiaki</namePart>
<namePart type="given">V</namePart>
<namePart type="family">Morooka</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lev</namePart>
<namePart type="family">Kharlashkin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Joensuu, Finland</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-360-9</identifier>
</relatedItem>
<abstract>Motivated by how inflectional morphology is encoded in modern embeddings, we revisit the 55,271 inflected forms from the 2,000 most frequent Finnish nouns analyzed by Nikolaev et al. (2022) using fastText and ask a single question: where does inflectional morphology emerge in BERT? For each form, we extract minimal-context FinBERT vectors from every layer (1–12) by running each word in isolation and averaging its WordPiece vectors into a single representation. Using the same generating model as in Nikolaev et al. (2022), we impute latent vectors for the stem, Number, Case, Possessive, and Clitic, plus a higher-order interaction, and evaluate by rank-1 nearest correlation. Within BERT, accuracy follows an emergence curve from 67.21% (layer 1) to 86.16% (layer 12). The error mix shifts with depth: middle layers show a lower share of Case errors but a higher share of Number errors, whereas the top layer reverses this tendency; clitic-only errors are rare throughout. For context, the fastText ceiling is slightly higher (≈89%), but our focus is the layer-resolved profile inside BERT. The result is a compact, reproducible map of Finnish noun inflection across the BERT stack, showing how different inflectional cues become recoverable at different depths (BERT layers) under an identical modeling and evaluation pipeline.</abstract>
<identifier type="citekey">nikolaev-etal-2025-case</identifier>
<location>
<url>https://aclanthology.org/2025.iwclul-1.16/</url>
</location>
<part>
<date>2025-12</date>
<extent unit="page">
<start>127</start>
<end>130</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Case–Number Dissociation in Finnish Noun Embeddings: fastText vs. BERT Layer Effects
%A Nikolaev, Alexandre
%A Chuang, Yu-Ying
%A Baayen, R. Harald
%Y Hämäläinen, Mika
%Y Rießler, Michael
%Y Morooka, Eiaki V.
%Y Kharlashkin, Lev
%S Proceedings of the 10th International Workshop on Computational Linguistics for Uralic Languages
%D 2025
%8 December
%I Association for Computational Linguistics
%C Joensuu, Finland
%@ 979-8-89176-360-9
%F nikolaev-etal-2025-case
%X Motivated by how inflectional morphology is encoded in modern embeddings, we revisit the 55,271 inflected forms from the 2,000 most frequent Finnish nouns analyzed by Nikolaev et al. (2022) using fastText and ask a single question: where does inflectional morphology emerge in BERT? For each form, we extract minimal-context FinBERT vectors from every layer (1–12) by running each word in isolation and averaging its WordPiece vectors into a single representation. Using the same generating model as in Nikolaev et al. (2022), we impute latent vectors for the stem, Number, Case, Possessive, and Clitic, plus a higher-order interaction, and evaluate by rank-1 nearest correlation. Within BERT, accuracy follows an emergence curve from 67.21% (layer 1) to 86.16% (layer 12). The error mix shifts with depth: middle layers show a lower share of Case errors but a higher share of Number errors, whereas the top layer reverses this tendency; clitic-only errors are rare throughout. For context, the fastText ceiling is slightly higher (≈89%), but our focus is the layer-resolved profile inside BERT. The result is a compact, reproducible map of Finnish noun inflection across the BERT stack, showing how different inflectional cues become recoverable at different depths (BERT layers) under an identical modeling and evaluation pipeline.
%U https://aclanthology.org/2025.iwclul-1.16/
%P 127-130
Markdown (Informal)
[Case–Number Dissociation in Finnish Noun Embeddings: fastText vs. BERT Layer Effects](https://aclanthology.org/2025.iwclul-1.16/) (Nikolaev et al., IWCLUL 2025)
ACL