@inproceedings{jeon-2025-beyond,
title = "Beyond Distribution: Investigating Language Models' Understanding of {S}ino-{K}orean Morphemes",
author = "Jeon, Taehee",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2025",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-emnlp.569/",
pages = "10762--10772",
ISBN = "979-8-89176-335-7",
abstract = "We investigate whether Transformer-based language models, trained solely on Hangul text, can learn the compositional morphology of Sino-Korean (SK) morphemes, which are fundamental to Korean vocabulary. Using BERT{\_}BASE and fastText, we conduct controlled experiments with target words and their ``real'' vs. ``fake'' neighbors{---}pairs that share a Hangul syllable representing the same SK morpheme vs. those that share only the Hangul syllable. Our results show that while both models{---}especially BERT{---}distinguish real and fake pairs to some extent, their performance is primarily driven by the frequency of each experimental word rather than a true understanding of SK morphemes. These findings highlight the limits of distributional learning for morpheme-level understanding and emphasize the need for explicit morphological modeling or Hanja-aware strategies to improve semantic representation in Korean language models. Our dataset and analysis code are available at: https://github.com/taeheejeon22/ko-skmorph-lm."
}

<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="jeon-2025-beyond">
    <titleInfo>
      <title>Beyond Distribution: Investigating Language Models’ Understanding of Sino-Korean Morphemes</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Taehee</namePart>
      <namePart type="family">Jeon</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2025-11</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: EMNLP 2025</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Christos</namePart>
        <namePart type="family">Christodoulopoulos</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Tanmoy</namePart>
        <namePart type="family">Chakraborty</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Carolyn</namePart>
        <namePart type="family">Rose</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Violet</namePart>
        <namePart type="family">Peng</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Suzhou, China</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-335-7</identifier>
    </relatedItem>
    <abstract>We investigate whether Transformer-based language models, trained solely on Hangul text, can learn the compositional morphology of Sino-Korean (SK) morphemes, which are fundamental to Korean vocabulary. Using BERT_BASE and fastText, we conduct controlled experiments with target words and their “real” vs. “fake” neighbors—pairs that share a Hangul syllable representing the same SK morpheme vs. those that share only the Hangul syllable. Our results show that while both models—especially BERT—distinguish real and fake pairs to some extent, their performance is primarily driven by the frequency of each experimental word rather than a true understanding of SK morphemes. These findings highlight the limits of distributional learning for morpheme-level understanding and emphasize the need for explicit morphological modeling or Hanja-aware strategies to improve semantic representation in Korean language models. Our dataset and analysis code are available at: https://github.com/taeheejeon22/ko-skmorph-lm.</abstract>
    <identifier type="citekey">jeon-2025-beyond</identifier>
    <location>
      <url>https://aclanthology.org/2025.findings-emnlp.569/</url>
    </location>
    <part>
      <date>2025-11</date>
      <extent unit="page">
        <start>10762</start>
        <end>10772</end>
      </extent>
    </part>
  </mods>
</modsCollection>

%0 Conference Proceedings
%T Beyond Distribution: Investigating Language Models’ Understanding of Sino-Korean Morphemes
%A Jeon, Taehee
%Y Christodoulopoulos, Christos
%Y Chakraborty, Tanmoy
%Y Rose, Carolyn
%Y Peng, Violet
%S Findings of the Association for Computational Linguistics: EMNLP 2025
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-335-7
%F jeon-2025-beyond
%X We investigate whether Transformer-based language models, trained solely on Hangul text, can learn the compositional morphology of Sino-Korean (SK) morphemes, which are fundamental to Korean vocabulary. Using BERT_BASE and fastText, we conduct controlled experiments with target words and their “real” vs. “fake” neighbors—pairs that share a Hangul syllable representing the same SK morpheme vs. those that share only the Hangul syllable. Our results show that while both models—especially BERT—distinguish real and fake pairs to some extent, their performance is primarily driven by the frequency of each experimental word rather than a true understanding of SK morphemes. These findings highlight the limits of distributional learning for morpheme-level understanding and emphasize the need for explicit morphological modeling or Hanja-aware strategies to improve semantic representation in Korean language models. Our dataset and analysis code are available at: https://github.com/taeheejeon22/ko-skmorph-lm.
%U https://aclanthology.org/2025.findings-emnlp.569/
%P 10762-10772

Markdown (Informal)

[Beyond Distribution: Investigating Language Models’ Understanding of Sino-Korean Morphemes](https://aclanthology.org/2025.findings-emnlp.569/) (Jeon, Findings 2025)

ACL

Taehee Jeon. 2025. Beyond Distribution: Investigating Language Models’ Understanding of Sino-Korean Morphemes. In Findings of the Association for Computational Linguistics: EMNLP 2025, pages 10762–10772, Suzhou, China. Association for Computational Linguistics.