@inproceedings{rama-wichmann-2018-towards,
    title = "Towards identifying the optimal datasize for lexically-based {Bayesian} inference of linguistic phylogenies",
    author = "Rama, Taraka and
      Wichmann, S{\o}ren",
    editor = "Bender, Emily M. and
      Derczynski, Leon and
      Isabelle, Pierre",
    booktitle = "Proceedings of the 27th International Conference on Computational Linguistics",
    month = aug,
    year = "2018",
    address = "Santa Fe, New Mexico, USA",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/C18-1134",
    pages = "1578--1590",
    abstract = "Bayesian linguistic phylogenies are standardly based on cognate matrices for words referring to a fix set of meanings{---}typically around 100-200. To this day there has not been any empirical investigation into which datasize is optimal. Here we determine, across a set of language families, the optimal number of meanings required for the best performance in Bayesian phylogenetic inference. We rank meanings by stability, infer phylogenetic trees using first the most stable meaning, then the two most stable meanings, and so on, computing the quartet distance of the resulting tree to the tree proposed by language family experts at each step of datasize increase. When a gold standard tree is not available we propose to instead compute the quartet distance between the tree based on the n-most stable meaning and the one based on the n + 1-most stable meanings, increasing n from 1 to N $-$ 1, where N is the total number of meanings. The assumption here is that the value of n for which the quartet distance begins to stabilize is also the value at which the quality of the tree ceases to improve. We show that this assumption is borne out. The results of the two methods vary across families, and the optimal number of meanings appears to correlate with the number of languages under consideration.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- Single MODS record for the paper; the ID matches the citekey identifier below. -->
  <mods ID="rama-wichmann-2018-towards">
    <titleInfo>
      <title>Towards identifying the optimal datasize for lexically-based Bayesian inference of linguistic phylogenies</title>
    </titleInfo>

    <!-- Authors of the paper. -->
    <name type="personal">
      <namePart type="given">Taraka</namePart>
      <namePart type="family">Rama</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Søren</namePart>
      <namePart type="family">Wichmann</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>

    <originInfo>
      <dateIssued>2018-08</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>

    <!-- Host item: the proceedings volume, with its editors, publisher and place. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 27th International Conference on Computational Linguistics</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Emily</namePart>
        <namePart type="given">M</namePart>
        <namePart type="family">Bender</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Leon</namePart>
        <namePart type="family">Derczynski</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Pierre</namePart>
        <namePart type="family">Isabelle</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Santa Fe, New Mexico, USA</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>

    <abstract>Bayesian linguistic phylogenies are standardly based on cognate matrices for words referring to a fix set of meanings—typically around 100-200. To this day there has not been any empirical investigation into which datasize is optimal. Here we determine, across a set of language families, the optimal number of meanings required for the best performance in Bayesian phylogenetic inference. We rank meanings by stability, infer phylogenetic trees using first the most stable meaning, then the two most stable meanings, and so on, computing the quartet distance of the resulting tree to the tree proposed by language family experts at each step of datasize increase. When a gold standard tree is not available we propose to instead compute the quartet distance between the tree based on the n-most stable meaning and the one based on the n + 1-most stable meanings, increasing n from 1 to N − 1, where N is the total number of meanings. The assumption here is that the value of n for which the quartet distance begins to stabilize is also the value at which the quality of the tree ceases to improve. We show that this assumption is borne out. The results of the two methods vary across families, and the optimal number of meanings appears to correlate with the number of languages under consideration.</abstract>
    <identifier type="citekey">rama-wichmann-2018-towards</identifier>
    <location>
      <url>https://aclanthology.org/C18-1134</url>
    </location>

    <!-- Date and page range recorded for this paper. -->
    <part>
      <date>2018-08</date>
      <extent unit="page">
        <start>1578</start>
        <end>1590</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Towards identifying the optimal datasize for lexically-based Bayesian inference of linguistic phylogenies
%A Rama, Taraka
%A Wichmann, Søren
%Y Bender, Emily M.
%Y Derczynski, Leon
%Y Isabelle, Pierre
%S Proceedings of the 27th International Conference on Computational Linguistics
%D 2018
%8 August
%I Association for Computational Linguistics
%C Santa Fe, New Mexico, USA
%F rama-wichmann-2018-towards
%X Bayesian linguistic phylogenies are standardly based on cognate matrices for words referring to a fix set of meanings—typically around 100-200. To this day there has not been any empirical investigation into which datasize is optimal. Here we determine, across a set of language families, the optimal number of meanings required for the best performance in Bayesian phylogenetic inference. We rank meanings by stability, infer phylogenetic trees using first the most stable meaning, then the two most stable meanings, and so on, computing the quartet distance of the resulting tree to the tree proposed by language family experts at each step of datasize increase. When a gold standard tree is not available we propose to instead compute the quartet distance between the tree based on the n-most stable meaning and the one based on the n + 1-most stable meanings, increasing n from 1 to N − 1, where N is the total number of meanings. The assumption here is that the value of n for which the quartet distance begins to stabilize is also the value at which the quality of the tree ceases to improve. We show that this assumption is borne out. The results of the two methods vary across families, and the optimal number of meanings appears to correlate with the number of languages under consideration.
%U https://aclanthology.org/C18-1134
%P 1578-1590
Markdown (Informal)
[Towards identifying the optimal datasize for lexically-based Bayesian inference of linguistic phylogenies](https://aclanthology.org/C18-1134) (Rama & Wichmann, COLING 2018)
ACL