@article{nguyen-eisenstein-2017-kernel,
title = "A Kernel Independence Test for Geographical Language Variation",
author = "Nguyen, Dong and
Eisenstein, Jacob",
journal = "Computational Linguistics",
volume = "43",
number = "3",
month = sep,
year = "2017",
address = "Cambridge, MA",
publisher = "MIT Press",
url = "https://aclanthology.org/J17-3003",
doi = "10.1162/COLI_a_00293",
pages = "567--592",
abstract = "Quantifying the degree of spatial dependence for linguistic variables is a key task for analyzing dialectal variation. However, existing approaches have important drawbacks. First, they are based on parametric models of dependence, which limits their power in cases where the underlying parametric assumptions are violated. Second, they are not applicable to all types of linguistic data: Some approaches apply only to frequencies, others to boolean indicators of whether a linguistic variable is present. We present a new method for measuring geographical language variation, which solves both of these problems. Our approach builds on Reproducing Kernel Hilbert Space (RKHS) representations for nonparametric statistics, and takes the form of a test statistic that is computed from pairs of individual geotagged observations without aggregation into predefined geographical bins. We compare this test with prior work using synthetic data as well as a diverse set of real data sets: a corpus of Dutch tweets, a Dutch syntactic atlas, and a data set of letters to the editor in North American newspapers. Our proposed test is shown to support robust inferences across a broad range of scenarios and types of data.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="nguyen-eisenstein-2017-kernel">
<titleInfo>
<title>A Kernel Independence Test for Geographical Language Variation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Dong</namePart>
<namePart type="family">Nguyen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jacob</namePart>
<namePart type="family">Eisenstein</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2017-09</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<genre authority="bibutilsgt">journal article</genre>
<relatedItem type="host">
<titleInfo>
<title>Computational Linguistics</title>
</titleInfo>
<originInfo>
<issuance>continuing</issuance>
<publisher>MIT Press</publisher>
<place>
<placeTerm type="text">Cambridge, MA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">periodical</genre>
<genre authority="bibutilsgt">academic journal</genre>
</relatedItem>
<abstract>Quantifying the degree of spatial dependence for linguistic variables is a key task for analyzing dialectal variation. However, existing approaches have important drawbacks. First, they are based on parametric models of dependence, which limits their power in cases where the underlying parametric assumptions are violated. Second, they are not applicable to all types of linguistic data: Some approaches apply only to frequencies, others to boolean indicators of whether a linguistic variable is present. We present a new method for measuring geographical language variation, which solves both of these problems. Our approach builds on Reproducing Kernel Hilbert Space (RKHS) representations for nonparametric statistics, and takes the form of a test statistic that is computed from pairs of individual geotagged observations without aggregation into predefined geographical bins. We compare this test with prior work using synthetic data as well as a diverse set of real data sets: a corpus of Dutch tweets, a Dutch syntactic atlas, and a data set of letters to the editor in North American newspapers. Our proposed test is shown to support robust inferences across a broad range of scenarios and types of data.</abstract>
<identifier type="citekey">nguyen-eisenstein-2017-kernel</identifier>
<identifier type="doi">10.1162/COLI_a_00293</identifier>
<location>
<url>https://aclanthology.org/J17-3003</url>
</location>
<part>
<date>2017-09</date>
<detail type="volume"><number>43</number></detail>
<detail type="issue"><number>3</number></detail>
<extent unit="page">
<start>567</start>
<end>592</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Journal Article
%T A Kernel Independence Test for Geographical Language Variation
%A Nguyen, Dong
%A Eisenstein, Jacob
%J Computational Linguistics
%D 2017
%8 September
%V 43
%N 3
%I MIT Press
%C Cambridge, MA
%F nguyen-eisenstein-2017-kernel
%X Quantifying the degree of spatial dependence for linguistic variables is a key task for analyzing dialectal variation. However, existing approaches have important drawbacks. First, they are based on parametric models of dependence, which limits their power in cases where the underlying parametric assumptions are violated. Second, they are not applicable to all types of linguistic data: Some approaches apply only to frequencies, others to boolean indicators of whether a linguistic variable is present. We present a new method for measuring geographical language variation, which solves both of these problems. Our approach builds on Reproducing Kernel Hilbert Space (RKHS) representations for nonparametric statistics, and takes the form of a test statistic that is computed from pairs of individual geotagged observations without aggregation into predefined geographical bins. We compare this test with prior work using synthetic data as well as a diverse set of real data sets: a corpus of Dutch tweets, a Dutch syntactic atlas, and a data set of letters to the editor in North American newspapers. Our proposed test is shown to support robust inferences across a broad range of scenarios and types of data.
%R 10.1162/COLI_a_00293
%U https://aclanthology.org/J17-3003
%U https://doi.org/10.1162/COLI_a_00293
%P 567-592
Markdown (Informal)
[A Kernel Independence Test for Geographical Language Variation](https://aclanthology.org/J17-3003) (Nguyen & Eisenstein, CL 2017)
ACL