@inproceedings{boldt-mortensen-2025-morpheme,
title = "Morpheme Induction for Emergent Language",
author = "Boldt, Brendon and
Mortensen, David R.",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.emnlp-main.1284/",
pages = "25275--25290",
ISBN = "979-8-89176-332-6",
abstract = "We introduce CSAR, an algorithm for inducing morphemes from emergent language corpora of parallel utterances and meanings. It is a greedy algorithm that (1) weights morphemes based on mutual information between forms and meanings, (2) selects the highest-weighted pair, (3) removes it from the corpus, and (4) repeats the process to induce further morphemes (i.e., Count, Select, Ablate, Repeat). The effectiveness of CSAR is first validated on procedurally generated datasets and compared against baselines for related tasks. Second, we validate CSAR{'}s performance on human language data to show that the algorithm makes reasonable predictions in adjacent domains. Finally, we analyze a handful of emergent languages, quantifying linguistic characteristics like degree of synonymy and polysemy."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="boldt-mortensen-2025-morpheme">
<titleInfo>
<title>Morpheme Induction for Emergent Language</title>
</titleInfo>
<name type="personal">
<namePart type="given">Brendon</namePart>
<namePart type="family">Boldt</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="given">R</namePart>
<namePart type="family">Mortensen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Christos</namePart>
<namePart type="family">Christodoulopoulos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tanmoy</namePart>
<namePart type="family">Chakraborty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Carolyn</namePart>
<namePart type="family">Rose</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Violet</namePart>
<namePart type="family">Peng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Suzhou, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-332-6</identifier>
</relatedItem>
<abstract>We introduce CSAR, an algorithm for inducing morphemes from emergent language corpora of parallel utterances and meanings. It is a greedy algorithm that (1) weights morphemes based on mutual information between forms and meanings, (2) selects the highest-weighted pair, (3) removes it from the corpus, and (4) repeats the process to induce further morphemes (i.e., Count, Select, Ablate, Repeat). The effectiveness of CSAR is first validated on procedurally generated datasets and compared against baselines for related tasks. Second, we validate CSAR’s performance on human language data to show that the algorithm makes reasonable predictions in adjacent domains. Finally, we analyze a handful of emergent languages, quantifying linguistic characteristics like degree of synonymy and polysemy.</abstract>
<identifier type="citekey">boldt-mortensen-2025-morpheme</identifier>
<location>
<url>https://aclanthology.org/2025.emnlp-main.1284/</url>
</location>
<part>
<date>2025-11</date>
<extent unit="page">
<start>25275</start>
<end>25290</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Morpheme Induction for Emergent Language
%A Boldt, Brendon
%A Mortensen, David R.
%Y Christodoulopoulos, Christos
%Y Chakraborty, Tanmoy
%Y Rose, Carolyn
%Y Peng, Violet
%S Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-332-6
%F boldt-mortensen-2025-morpheme
%X We introduce CSAR, an algorithm for inducing morphemes from emergent language corpora of parallel utterances and meanings. It is a greedy algorithm that (1) weights morphemes based on mutual information between forms and meanings, (2) selects the highest-weighted pair, (3) removes it from the corpus, and (4) repeats the process to induce further morphemes (i.e., Count, Select, Ablate, Repeat). The effectiveness of CSAR is first validated on procedurally generated datasets and compared against baselines for related tasks. Second, we validate CSAR’s performance on human language data to show that the algorithm makes reasonable predictions in adjacent domains. Finally, we analyze a handful of emergent languages, quantifying linguistic characteristics like degree of synonymy and polysemy.
%U https://aclanthology.org/2025.emnlp-main.1284/
%P 25275-25290
Markdown (Informal)
[Morpheme Induction for Emergent Language](https://aclanthology.org/2025.emnlp-main.1284/) (Boldt & Mortensen, EMNLP 2025)
ACL
- Brendon Boldt and David R. Mortensen. 2025. Morpheme Induction for Emergent Language. In Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing, pages 25275–25290, Suzhou, China. Association for Computational Linguistics.