@inproceedings{lerner-yvon-2025-unlike,
title = "Unlike {\textquotedblleft}Likely{\textquotedblright}, {\textquotedblleft}Unlike{\textquotedblright} is Unlikely: {BPE}-based Segmentation hurts Morphological Derivations in {LLM}s",
author = "Lerner, Paul and
Yvon, Fran{\c{c}}ois",
editor = "Rambow, Owen and
Wanner, Leo and
Apidianaki, Marianna and
Al-Khalifa, Hend and
Di Eugenio, Barbara and
Schockaert, Steven",
booktitle = "Proceedings of the 31st International Conference on Computational Linguistics",
month = jan,
year = "2025",
address = "Abu Dhabi, UAE",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.coling-main.348/",
pages = "5181--5190",
abstract = "Large Language Models (LLMs) rely on subword vocabularies to process and generate text. However, because subwords are marked as initial- or intra-word, we find that LLMs perform poorly at handling some types of affixations, which hinders their ability to generate novel (unobserved) word forms. The largest models trained on enough data can mitigate this tendency because their initial- and intra-word embeddings are aligned; in-context learning also helps when all examples are selected in a consistent way; but only morphological segmentation can achieve a near-perfect accuracy."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="lerner-yvon-2025-unlike">
<titleInfo>
<title>Unlike “Likely”, “Unlike” is Unlikely: BPE-based Segmentation hurts Morphological Derivations in LLMs</title>
</titleInfo>
<name type="personal">
<namePart type="given">Paul</namePart>
<namePart type="family">Lerner</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">François</namePart>
<namePart type="family">Yvon</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-01</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 31st International Conference on Computational Linguistics</title>
</titleInfo>
<name type="personal">
<namePart type="given">Owen</namePart>
<namePart type="family">Rambow</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Leo</namePart>
<namePart type="family">Wanner</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marianna</namePart>
<namePart type="family">Apidianaki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hend</namePart>
<namePart type="family">Al-Khalifa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Barbara</namePart>
<namePart type="family">Di Eugenio</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Steven</namePart>
<namePart type="family">Schockaert</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, UAE</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Large Language Models (LLMs) rely on subword vocabularies to process and generate text. However, because subwords are marked as initial- or intra-word, we find that LLMs perform poorly at handling some types of affixations, which hinders their ability to generate novel (unobserved) word forms. The largest models trained on enough data can mitigate this tendency because their initial- and intra-word embeddings are aligned; in-context learning also helps when all examples are selected in a consistent way; but only morphological segmentation can achieve a near-perfect accuracy.</abstract>
<identifier type="citekey">lerner-yvon-2025-unlike</identifier>
<location>
<url>https://aclanthology.org/2025.coling-main.348/</url>
</location>
<part>
<date>2025-01</date>
<extent unit="page">
<start>5181</start>
<end>5190</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Unlike “Likely”, “Unlike” is Unlikely: BPE-based Segmentation hurts Morphological Derivations in LLMs
%A Lerner, Paul
%A Yvon, François
%Y Rambow, Owen
%Y Wanner, Leo
%Y Apidianaki, Marianna
%Y Al-Khalifa, Hend
%Y Di Eugenio, Barbara
%Y Schockaert, Steven
%S Proceedings of the 31st International Conference on Computational Linguistics
%D 2025
%8 January
%I Association for Computational Linguistics
%C Abu Dhabi, UAE
%F lerner-yvon-2025-unlike
%X Large Language Models (LLMs) rely on subword vocabularies to process and generate text. However, because subwords are marked as initial- or intra-word, we find that LLMs perform poorly at handling some types of affixations, which hinders their ability to generate novel (unobserved) word forms. The largest models trained on enough data can mitigate this tendency because their initial- and intra-word embeddings are aligned; in-context learning also helps when all examples are selected in a consistent way; but only morphological segmentation can achieve a near-perfect accuracy.
%U https://aclanthology.org/2025.coling-main.348/
%P 5181-5190
Markdown (Informal)
[Unlike “Likely”, “Unlike” is Unlikely: BPE-based Segmentation hurts Morphological Derivations in LLMs](https://aclanthology.org/2025.coling-main.348/) (Lerner & Yvon, COLING 2025)
ACL