<?xml version="1.0" encoding="UTF-8"?>
<algorithms version="110505">
<algorithm name="SectLabel" version="110505">
<variant no="0" confidence="0.001968">
<title confidence="0.966342">
Challenges of studying and processing dialects in social media
</title>
<author confidence="0.995963">
Anna Katrine Jørgensen, Dirk Hovy, and Anders Søgaard
</author>
<affiliation confidence="0.997241">
University of Copenhagen
</affiliation>
<address confidence="0.8667955">
Njalsgade 140
DK-2300 Copenhagen S
</address>
<email confidence="0.998262">
soegaard@hum.ku.dk
</email>
<sectionHeader confidence="0.993846" genericHeader="abstract">
Abstract
</sectionHeader>
<bodyText confidence="0.983551045454546">
Dialect features typically do not make
it into formal writing, but flourish
in social media. This enables large-
scale variational studies. We fo-
cus on three phonological features of
African American Vernacular English
and their manifestation as spelling
variations on Twitter. We discuss to
what extent our data can be used to
falsify eight sociolinguistic hypothe-
ses. To go beyond the spelling
level, we require automatic analysis
such as POS tagging, but social me-
dia language still challenges language
technologies. We show how both
newswire- and Twitter-adapted state-
of-the-art POS taggers perform signif-
icantly worse on AAVE tweets, sug-
gesting that large-scale dialect studies
of language variation beyond the sur-
face level are not feasible with out-of-
the-box NLP tools.
</bodyText>
<sectionHeader confidence="0.999132" genericHeader="keywords">
1 Introduction
</sectionHeader>
<bodyText confidence="0.999958525">
Dialectal and sociolinguistic studies are tradi-
tionally based on interviews of small sets of
speakers of each variety. The Atlas of North
American English (Labov et al., 2005) has
been the reference point for American dialec-
tology since its completion, but is based on
only 762 speakers. Dallas is represented by
four subjects, the New York City dialect by
six, etc. Data is costly to collect, and, as a
consequence, scarce.
Written language was traditionally used for
formal purposes, and therefore differed in
style from colloquial, spoken language. How-
ever, with the rise of social media platforms
and the vast production of user generated con-
tent, differences between written and spoken
language diminish. A number of recent papers
have explored social media with respect to
sociolinguistic and dialectological questions
(Rao et al., 2010; Eisenstein, 2013; Volkova
et al., 2013; Doyle, 2014; Hovy et al., 2015;
Volkova et al., 2015; Johannsen et al., 2015;
Hovy and Søgaard, 2015; Eisenstein, to ap-
pear). Emails, chats and social media posts
serve purposes similar to those of spoken lan-
guage, and consequently, features of spoken
language, such as interjections, ellipses, and
phonological variation, have found their way
into this type of written language. Our work
differs from most previous approaches by in-
vestigating several phonological spelling cor-
relates of a specific language variety.
The 284 million active users on Twitter post
more than half a billion tweets every day, and
some fraction of these tweets are geo-located.
Eisenstein (2013) and Doyle (2014) studied
the effect of phonological variation across the
US on spelling in Twitter posts, and both
found some evidence that dialectal phonolog-
ical variation has a direct impact on spelling
</bodyText>
<page confidence="0.970939">
9
</page>
<note confidence="0.988359">
Proceedings of the ACL 2015 Workshop on Noisy User-generated Text, pages 9–18,
Beijing, China, July 31, 2015. © 2015 Association for Computational Linguistics
</note>
<bodyText confidence="0.999917634146342">
on Twitter. Both authors note various method-
ological problems using Twitter as a source of
evidence for dialectal and sociolinguistic stud-
ies, including what we refer to as USER POP-
ULATION BIAS and TOPIC BIAS below.
In this paper, we collect Twitter data to
test eight (8) research hypotheses originating
in sociolinguistic studies of African-American
Vernacular English (AAVE). The hypotheses
relate to three phonological features of AAVE,
namely derhotacization, interdental fricative
mutation, and backing in /str/. Some of our
findings shed an interesting light on existing
hypotheses, but our main focus in this paper
is to identify the methodological challenges in
using social media for testing sociolinguistic
hypotheses.
Almost all previous large-scale variational
studies using social media have focused on
spelling variation and lexical markers of di-
alect. Ours is no exception. However, di-
alectal variation also manifests itself at the
morpho-syntactic level. To investigate this
variation, we also annotate some data with
part-of-speech (POS) tags, using two NLP
systems. This approach reveals a severe
methodological challenge: sentences contain-
ing AAVE features are associated with signif-
icant drops in tagger performance.
This result challenges large-scale varia-
tional studies on social media that require au-
tomated analyses. The observed drops in per-
formance are prohibitive for studying syntac-
tic and semantic variation, and we believe the
NLP community should make an effort to pro-
vide better and more robust dialect-adapted
models to researchers and industry interested
in processing social media. The findings also
raise the question of whether NLP technology
systematically disadvantages groups of non-
standard language users.
</bodyText>
<subsectionHeader confidence="0.990296">
1.1 Contributions
</subsectionHeader>
<listItem confidence="0.998711913043478">
• We identify eight (8) research hypotheses
from the sociolinguistic literature. We
test them in a study of the distribution of
three phonological features typically as-
sociated with AAVE in Twitter data. We
test the features’ correlations with vari-
ous demographic variables. Our results
falsify the hypothesis that AAVE is male-
dominated (but see §3.1).
• We identify five (5) methodological
problems common to variational studies
in social media and discuss to what ex-
tent they compromise the validity of re-
sults.
• Further, we show that state-of-the-art
newswire and Twitter POS taggers per-
form much worse on tweets containing
AAVE features. This suggests an addi-
tional limitation to large-scale sociolin-
guistic research using social media data,
namely that it is hard to analyze varia-
tion beyond the lexical level with current
tools.
</listItem>
<subsectionHeader confidence="0.995279">
1.2 Sociolinguistic hypotheses
</subsectionHeader>
<bodyText confidence="0.9997259">
AAVE is, in contrast to other North American
dialects, not geographically restricted. Al-
though variation in AAVE does exist, AAVE
in urban settings has been established as a
uniform system with suprasegmental norms
(Ash and Myhill, 1986; Labov et al., 2005;
Labov, 2006; Wolfram, 2004). This paper
considers the following eight (8) hypotheses
from the sociolinguistic literature about
AAVE as an ethnolect:
</bodyText>
<footnote confidence="0.4263052">
H1: AAVE is an urban ethnolect (Rickford, 1999;
Wolfram, 2004).
H2: AAVE features are more present in the Gulf states
than in the rest of the United States (Rastogi et al.,
2011).
</footnote>
<page confidence="0.947115">
10
</page>
<listItem confidence="0.8813898125">
H3: The likelihood of speaking AAVE correlates
negatively with income and educational level,
and AAVE is more frequently appropriated by
men (Rickford, 1999; Rickford, 2010).
H4: Derhotacization is more frequent in African
Americans than in European Americans (Labov
et al., 2005; Rickford, 1999).
H5: Derhotacization is negatively correlated with in-
come and educational level (Rickford, 1999).
H6: Interdental fricative mutation is more frequent in
AAVE than in European American speech (Pol-
lock et al., 1998; Thomas, 2007).
H7: Interdental fricative mutation is predominantly
found in the Gulf states (Rastogi et al., 2011).
H8: Backing in /str/ (to /skr/) is unique to AAVE
(Rickford, 1999; Thomas, 2007; Labov, 2006).
</listItem>
<bodyText confidence="0.999811823529412">
Hypotheses 1–8 are investigated by corre-
lating the distribution of phonological variants
in geo-located tweets with demographic infor-
mation.
Our method is similar to those proposed
by Eisenstein (2013) and Doyle (2014), lend-
ing statistical power to sociolinguistic analy-
ses, and circumventing traditional issues with
data collection such as the Observer’s Para-
dox (Labov, 1972b; Meyerhof, 2006). Our
work differs from previous work by studying
phonological rules associated with specific di-
alects, as well as considering a wide range of
actual sociolinguistic research hypotheses, but
our main focus is the methodological prob-
lems doing this kind of work, as well as as-
sessing the limitations of such work.
</bodyText>
<subsectionHeader confidence="0.998867">
1.3 Methodological problems
</subsectionHeader>
<bodyText confidence="0.9992818125">
One obvious challenge relating social media
data to sociolinguistic studies is that there
is generally not a one-to-one relationship
between phonological variation and spelling
variation. People, in other words, do not spell
the way they pronounce. Eisenstein (2013)
discusses this challenge ((1) WRITING BIAS),
but shows that effects of the phonological en-
vironment carry over to social media, which
he interprets as evidence that there is at least
some causal link between pronunciation and
spelling variation.
A related problem is that non-speakers of
AAVE may cite known features of AAVE with
specific purposes in mind. They may use it in
citations, for example:
</bodyText>
<listItem confidence="0.9943135">
(1) My 5 year old sister texted me on my mums phone
saying “why did you take a picher in da bafroom”
lool okay b (Twitter, Feb 21 2015)
or in meta-linguistic discussions:
(2) Whenever I hear a black person inquire about the
location of the ”bafroom”... (Twitter, Jan 20 2015)
</listItem>
<bodyText confidence="0.98040325">
We refer to these phenomena as (2) META-
USE BIAS. This bias is important with rare
phenomena. With ”bafroom”, it seems that
about 1 in 20 occurrences on Twitter are meta-
uses. Meta-uses may also serve social func-
tions. AAVE features are used as cultural
markers by Latinos in North Carolina (Carter,
2013), for example.
Some of the research hypotheses consid-
ered (H3 and H5) relate to demographic vari-
ables such as income and educational levels.
While we do not have socio-economic infor-
mation about the individual Twitter user, we
can use the geo-located tweets to study the
correlation between socio-economic variables
and linguistic features at the level of cities or
ZIP codes.1
Eisenstein et al. (2011) note that this level
of abstraction introduces some noise. Since
Twitter users do not form representative sam-
ples of the population, the mean income for a
city or ZIP code is not necessarily the mean
income for the Twitter users in that area. We
refer to this problem as the (3) USER POPU-
LATION BIAS.
Another serious methodological problem
known as (4) GALTON’S PROBLEM (Naroll,
1961; Roberts and Winters, 2013), is the ob-
servation that cross-cultural associations are
1Unlike many others, we rely on physical locations
rather than user-entered profile locations. See Graham
et al. (2014) for discussion.
</bodyText>
<page confidence="0.995706">
11
</page>
<bodyText confidence="0.99998318367347">
often explained by geographical diffusion. In
other words, it is the problem of discrimi-
nating historical from functional associations
in cross-cultural surveys. Briefly put, when
we sample tweets and income-levels from US
cities, there is little independence between
the city data points. Linguistic features dif-
fuse geographically and do not change at ran-
dom, and we can therefore expect to see more
spurious correlations than usual. Like with
the famous example of chocolate and Nobel
Prize winners, our positive findings may be
explained by hidden background variables. A
positive correlation between income-level and
a phonological pattern may also have cultural,
religious or geographical explanations.
Reasons to be less worried about GAL-
TON’S PROBLEM in our case, include that a)
we only consider standard hypotheses from
the sociolinguistics literature and not a huge
set of previously unexplored, automatically
generated hypotheses, b) we sample data
points at random from all across the US, giv-
ing us a very sparse distribution compared
to country-level data, but more notably, c)
location is an important, explicit variable in
our study. GALTON’S PROBLEM is typically
identified by clustering tests based on loca-
tion (Naroll, 1961). Obviously, the phono-
logical features considered here cluster geo-
graphically, as evidenced by our geographic
correlations in Table 2, but since our studies
explicitly test the influence of location, it is
not the case for most of the hypotheses con-
sidered here that geographic diffusion is the
underlying explanation for something else.
In §3, we discuss whether these four
methodological problems compromise the va-
lidity of our findings. One other methodolog-
ical problem that may be relevant for other
studies of dialect in social media, is almost
completely irrelevant for our study: It is often
important to control for topic in dialectal and
sociolinguistic studies (Bamman et al., 2014),
e.g., when studying the lexical preferences of
speakers of urban ethnolects. We call this
problem (5) TOPIC BIAS. Using word pairs
with equivalent meanings for our studies, we
implicitly control for topic (but see §3.1).
</bodyText>
<table confidence="0.999142612903226">
Feature Positive Negative Total count
brotha brother 9528
foreva forever 3673
hea here 4352
lova lover 1273
motha mother 4668
/r/ /Ø/ or /@/ ova over 3441
sista sister 5325
wateva whatever 2974
wea where 5153
total 40,387
kreet street 1226
/str/ /skr/ skrong strong 1629
skrip strip 1101
total 3956
brova brother 3715
dat that 2610
deez these 4477
/D/ /d/or/v/ dem them 3645
dey they 2434
dis this 2135
mova mother 2462
total 21,478
mouf mouth 3861
nuffin nothing 2861
souf south 1102
/T/ /t/ or /f/ teef teeth 1857
trough through 2804
trow throw 1090
total 13,575
All tweets 79,396
</table>
<tableCaption confidence="0.999683">
Table 1: Word pairs and counts
</tableCaption>
<sectionHeader confidence="0.872754" genericHeader="introduction">
2 Data and Method
</sectionHeader>
<bodyText confidence="0.99911725">
We focus on derhotacization, backing in /str/,
and interdental fricative mutation. Specifi-
cally, we collect data to study the following
four phonological variations (the latter two are
both instances of interdental fricative muta-
tion): a) derhotacization: /r/ → /Ø/ or /@/,
b) /str/ → /skr/, c) /D/ → /d/ or /v/ and, d) /T/
→ /t/ or /f/.
In non-rhotic dialects, /r/ is either not pro-
nounced or is approximated as a vocalization
in the surface form, when /r/ is in a pre-vocalic
position. This can result in an elongation of
the preceding vowel or in an off-glide schwa
/@/, e.g., guard → /gA:d/, car → /ka:/, fear →
/fi@/ (Thomas, 2007).
Backing in /str/ denotes the substitution
</bodyText>
<page confidence="0.994">
12
</page>
<bodyText confidence="0.999508512195122">
of /str/ for /skr/ in word-initial positions re-
sulting in pronunciations such as /skrit/ for
street, /skrON/ for strong and /skrIp/ for strip.
Backing in /str/ has been reported to be a
unique feature in AAVE, as it is unheard
in other North American dialects (Rickford,
1999; Labov, 1972a; Thomas, 2007).
The two interdental fricative mutations re-
late to substitutions of /D/ and /T/ by /d/, /v/
and /t/, /f/ in words such as that and mother
or nothing and with. It has been reported
that mutations of /D/ and /T/ are more com-
mon among African Americans than among
European Americans and that the frequency
of the mutations is inversely correlated with
socio-economic levels and formality of speak-
ing (Rickford, 1999).
We follow Eisenstein (2013) and Doyle
(2014) in assuming that spelling variation may
be a result of phonological differences and
select 25 word pairs for our study (Table
1). For each word pair, we collect positive
(e.g., ”skreet”) and negative occurrences (e.g.,
”street”), resulting in a total number of 79,396
tweets. The word pairs were chosen based on
the unambiguity, frequency and representabil-
ity of the phonological variations. Uniquely,
backing in /str/ is represented by three word
pairs of high similarity, which is due to phono-
logical restrictions on the variation of /str/ to
/skr/ and to the fact that backing in /str/ is a
very rare phenomenon.
The Twitter data used in the experiments
was gathered from May to August 2014 us-
ing TwitterSearch.2 We only collected tweets
with geo-locations in the contiguous United
States, from users reporting to tweet in En-
glish, and which were also predicted to be
in English using langid.py.3 The demo-
graphic information was obtained from the
2012 American Community Survey from the
</bodyText>
<footnote confidence="0.9984245">
2https://pypi.python.org/pypi/TwitterSearch/
3https://pypi.python.org/pypi/langid
</footnote>
<bodyText confidence="0.9809626">
United States Census Bureau, as was informa-
tion about population sizes in US cities. We
linked each tweet in our data to demographic
information using the geo-coordinates of the
tweet and its nearest city in the following way.
</bodyText>
<figureCaption confidence="0.9254915">
Figure 1: The ratio of AAVE examples over
US states
</figureCaption>
<bodyText confidence="0.999940642857143">
For the 110 US cities of ≥ 200,000 inhabi-
tants, we gathered information about: a) per-
centage high school graduates, b) percent-
age below poverty level, c) population size,
d) median household income, e) percentage of
males, f) percentage between 15 and 24 years
old, g) percentage of African Americans and
h) unemployment rate.
The overall geographical distribution of our
data is shown in Figure 1. The map shows that
we see more tweets with AAVE features in
the Gulf states, in particular Louisiana, Mis-
sissippi and Georgia. This lends preliminary
support to H2.
</bodyText>
<sectionHeader confidence="0.999118" genericHeader="method">
3 Results with phonological features
</sectionHeader>
<bodyText confidence="0.999745111111111">
Occurrences of the phonological variations
related to AAVE were correlated with the
geographic and demographic variables using
Spearman’s ρ (Table 2–3), at the level of in-
dividual tweets. From the correlation coeffi-
cients we see that the distributions of the three
chosen AAVE rules are best explained by lon-
gitude, the distinction between the Gulf states
and the rest of the US, and by the distribution
</bodyText>
<page confidence="0.998998">
13
</page>
<tableCaption confidence="0.994739">
– = p &gt; 0.05, * = 0.05 &gt; p &gt; 0.01, ** = p &lt; 0.01, *** = p &lt; 0.0005
Shading corresponds to negative correlations
Table 3: Demographic correlations
</tableCaption>
<bodyText confidence="0.944546">
Feature word pairs male black 15-24 citysize highschool income poverty unemployment
</bodyText>
<equation confidence="0.952916347826087">
skreet/street – – – ** * ** * **
skrong/strong ** *** – * ** ** ** *
skrip/strip * – * *** *** – *** ***
/str/ —. /skr/
brova/brother *** *** *** *** *** – *** ***
dat/that – *** – – – ** ** –
deez/these – – – ** *** – ** ***
dem/them***********–– –
dey/they *** *** ** * ** ** *** –
dis/this–*****–– –**
mova/mother *** *** *** – *** *** – ***
/D/ —. /d/ or /v/
total *** *** *** *** – ** – *
/r/ —. /fb/ or /@/
brotha/brother *** ** – – ** – –
foreva/forever ** *** – – – – ** –
hea/here – *** ** *** *** *** *** *
lova/lover ––––****** –
motha/mother –**–*–**– –
ova/over ******–––****** –
sista/sister * *** – – ** – –
wateva/whatever *** *** – – – *** *** –
wea/where ** *** *** *** *** *** ***
</equation>
<figure confidence="0.8814922">
total ***
*** ***
***
***
***
***
–
–
*
– -
***
total ***
***
***
***
/T/ —. /t/ or /f/
total *** *** *** – *** – *** ***
mouf/mouth ** – – – – –
nuffin/nothing *** *** *** *** *** *** – ***
souf/south *** – ** – ** – *** ***
teef/teeth – – – – ** – – –
trough/through – – –
trow/throw * – –
–
***
** – *
** * **
– –
*
**
</figure>
<bodyText confidence="0.994996209302326">
of African Americans (with explained vari-
ances in the range of 0.03-0.05).
Our data suggests that H2, namely that
AAVE is more prevalent in the Gulf states,
is probably true. Hypothesis H1, that AAVE
is an urban ethnolect, lends some support in
our data, but the correlation with urbanicity
is weaker (and negatively correlated or non-
significant in half of the cases).
Our data only lends limited support to the
first half of hypothesis H3. While derhota-
cization and /str/ correlate (negatively) signif-
icantly with income levels, we see no signifi-
cant correlations within /D/ and a positive cor-
relation within /T/. However, our data does not
suggest that H3 is false, either. Our data does
lend support to the more specific hypothesis
H5, namely that derhoticization is sensitive to
income level, while the strong correlation with
the distribution of African Americans lends
support to H4.
More interestingly, our data suggests that
women use AAVE features more often than
men, i.e., there is a negative correlation be-
tween male gender and AAVE features, con-
trary to the second half of H3, namely that
AAVE is more frequently appropriated by
men. Note, however, that our gender ratios
are aggregated for city areas, and with the de-
mographic bias of Twitter, these correlations
should be taken with a grain of salt. Consider-
ing the small gender ratio differences, we also
compute correlations between our linguistic
features and gender using the Rovereto Twit-
ter N-gram Corpus (RTC) (Herdagdelen and
Baroni, 2011).4 The RTC corpus contains in-
formation about the gender of the tweeter as-
sociated with n-grams. While there is too lit-
tle data in the corpus to correlate gender and
backing in /str/, derhotacization and both in-
terdental fricative mutations (/D/ → /d/ or /v/
and /T/ → /t/ or /f/) correlate significantly with
women. Out of our words, 10 correlate sig-
</bodyText>
<footnote confidence="0.932756">
4http://clic.cimec.unitn.it/amac/
twitter_ngram/
</footnote>
<page confidence="0.99812">
14
</page>
<bodyText confidence="0.94444">
Feature word pairs latitude longitude urban Gulf
</bodyText>
<equation confidence="0.906226">
– = p &gt; 0.05, * = 0.05 &gt; p &gt; 0.01, ** = p &lt; 0.01, ***
= p &lt; 0.0001
</equation>
<tableCaption confidence="0.90128">
Shading corresponds to negative correlations
Table 2: Geographic correlations
</tableCaption>
<bodyText confidence="0.9995195">
nificantly with female speakers; seven with
male. The correlations are found in Table 4.
For each feature, certain words correlate sig-
nificantly with female speakers, while oth-
ers correlate significantly with male speakers.
Consequently, neither our Twitter data nor the
Twitter data in the RTC suggest that AAVE is
more often appropriated by men. We discuss
whether our data provides a basis for falsify-
ing the second half of H3 in §3.1.
The high correlation between mutations of
/D/ and longitude supports the presence of
these mutations of /D/ in non-standard north-
ern varieties (Rickford, 1999). The mutation
of /T/ is also correlated with longitude, and
with latitude, suggesting an Eastern Ameri-
can feature rather than a distinct Southern fea-
ture (Rickford, 1999). The variation in muta-
tions could possibly be explained by both ge-
ography as well as the distribution of African
Americans.
There is evidence in our data that backing
in /str/ (to /skr/) is appropriated more often by
AAVE speakers than by speakers of other di-
alects (H8). There is also a negative correla-
tion between latitude and backing in /str/ as
well as a strong positive correlation with the
Gulf states, suggesting that backing in /str/ is a
feature primarily seen in this region. The data
thereby suggests that the feature is appropri-
ated significantly more by African Americans
than by speakers of the Southern dialect.
In sum, while our data lends support to sev-
eral of the common hypotheses from the so-
ciolinguistics literature, we found one unex-
pected tendency, going against the second half
of H3, namely that AAVE features were found
more often with females. We now discuss this
finding in light of the methodological prob-
lems discussed in §1.2.
</bodyText>
<table confidence="0.997626739130435">
Feature word pairs male
brotha-brother **
foreva-forever **
hea-here *
lova-lover –
/r/ —. /fb/ or /@/ motha-mother **
ova-over **
sista-sister –
wateva-whatever –
wea-where **
brova-brother *
dat-that **
deez-these **
D —. /d/ or /v/ dem-them **
dey-they **
dis-this **
mova-mother –
mouf-mouth **
nuffin-nothing **
souf-south **
T —. /f/ or /t/ teef-teeth –
trough-through **
trow-throw **
</table>
<tableCaption confidence="0.967264">
– = p &gt; 0.05, * = 0.05 &gt; p &gt; 0.01, ** = p &lt; 0.01
Shading corresponds to negative correlations
Table 4: Gender correlations in RTC
</tableCaption>
<subsectionHeader confidence="0.996744">
3.1 Is AAVE not male-dominated?
</subsectionHeader>
<bodyText confidence="0.999428">
We now discuss whether our data falsifies
the second half of H3, one methodological
problem at a time (see §1.3). If WRITING
BIAS were to bias our conclusions, one gen-
der should be more likely to exhibit more
phonologically motivated spelling variation.
This may actually be true, since it is well-
</bodyText>
<figure confidence="0.999356969230769">
total
***
*** *** ***
trow/throw
***
** – ***
brotha/brother *** ***
** – ***
*** * ***
*** ***
lova/lover *** *** ** ***
motha/mother––*** –
ova/over *** – – ***
/r/
***
***
***
***
wateva/whatever
wea/where
total
skreet/street
skrong/strong
skrip/strip
total
***
***
***
***
***
***
***
*** ** ***
***
***
–
* *** ***
*** ***
–
** – ***
sista/sister – ***
/str/
–
brova/brother *** *** *** ***
dat/that *** * – ***
deez/these****– –
/D/ dem/them *** *** – ***
dey/they *** *** – ***
dis/this *** – – ***
mova/mother * *** *** ***
mouf/mouth *** – – ***
nuffin/nothing *** *** *** ***
souf/south *** *** *** ***
teef/teeth ** – ** ***
trough/through – ***
total * *** *** ***
/T/
– –
foreva/forever
hea/here
***
***
**
***
***
</figure>
<page confidence="0.964008">
15
</page>
<bodyText confidence="0.999755809523809">
established that women tend to be more lin-
guistically creative and have larger vocabular-
ies (Labov, 1990; Brizendine, 2006). Whether
women are also more meta-linguistic (META-
USE BIAS), has to the best of our knowl-
edge not been studied. Since genders are al-
most equally geographically distributed, and
since Twitter is generally considered gender-
balanced, neither USER POPULATION BIAS
nor GALTON’S PROBLEM is likely to bias our
conclusions. TOPIC BIAS, on the other hand,
may. While our semantically equivalent pairs
control for topic, the pragmatics sometimes
differ. Just like code-switching is a strategy
for bilinguals, using the spelling motha in-
stead of mother could mean something, say
irony, which one gender is more prone for. In
sum, while we do believe that our data should
lead sociolinguists to question whether AAVE
is male-dominated, our findings may be bi-
ased by WRITING BIAS.
</bodyText>
<sectionHeader confidence="0.995763" genericHeader="method">
4 POS tagging
</sectionHeader>
<bodyText confidence="0.999788266666667">
We need automated syntactic analysis to study
morpho-syntactic dialectal variation. We
ran a state-of-the-art POS tagger trained on
newswire5 (STANFORD), as well as two state-
of-the-art POS taggers adapted to Twitter,
namely GATE6 and ARK7, on our data. We
had one professional annotator manually an-
notate 100 positive (AAVE) and 100 nega-
tive (non-AAVE) sentences using the coarse-
grained tags proposed by Petrov et al. (2011).
We map the tagger outputs to those tags and
report tagging accuracies. See Table 5 for re-
sults, with A(+, −) being the absolute dif-
ference in performance from non-AAVE to
AAVE.
</bodyText>
<footnote confidence="0.977766333333333">
5http://nlp.stanford.edu/software/
tagger.shtml
6https://gate.ac.uk/wiki/
twitter-postagger.html
7http://www.ark.cs.cmu.edu/
TweetNLP/
</footnote>
<table confidence="0.997828">
STANFORD GATE ARK
AAVE 61.4 79.1 77.5
non-AAVE 74.5 83.3 77.9
Δ(+,-) 13.1 4.2 0.4
</table>
<tableCaption confidence="0.998666">
Table 5: POS tagging accuracies (%)
</tableCaption>
<bodyText confidence="0.999778529411765">
While GATE is certainly better than STAN-
FORD on our data, performance is generally
poor and prohibitive of many downstream ap-
plications and variational studies. We also
note that both the best and worst tagger per-
form significantly worse on AAVE tweets
than on non-AAVE tweets. What are the
sources of error in the AAVE data? One ex-
ample is the word brotha, which is tagged
both as an adverb, a verb, and as X (foreign
words, mark-up, etc.). Contractions like finna
(”fixing to” meaning ”going to”) and gimme
(”give me”) are often tagged as particles, but
annotated as verbs or, as in the case of witchu
(”with you”), as a preposition. Another inter-
esting mistake is tagging adverbial like as a
verb.
</bodyText>
<sectionHeader confidence="0.998833" genericHeader="conclusions">
5 Conclusion
</sectionHeader>
<bodyText confidence="0.999948769230769">
Large-scale variational studies of social me-
dia can be used to question received wisdom
about dialects, lending support to some soci-
olinguistic research hypotheses and question-
ing others. However, we caution that our re-
sults were biased by several factors, includ-
ing the representativity of the social media
user bases. We also show how state-of-the-
art POS taggers are more likely to fail on
dialects in social media. The performance
drops may be considered prohibitive of study-
ing morpho-syntactic patterns across dialects
and as a challenge to us as a community.
</bodyText>
<sectionHeader confidence="0.987933" genericHeader="references">
References
</sectionHeader>
<bodyText confidence="0.5651825">
Sharon Ash and John Myhill. 1986. Linguis-
tic correlates of inter-ethnic contact. In David
</bodyText>
<page confidence="0.994112">
16
</page>
<reference confidence="0.995123674157304">
Sankoff, editor, Diversity and Diachronyc,
pages 33–44, Amsterdam and Philadelphia.
John Benjamins Publishing Co.
David Bamman, Jacob Eisenstein, and Tyler Sch-
noebelen. 2014. Gender identity and lexical
variation in social media. Journal of Sociolin-
guistics, 18.
Louann Brizendine. 2006. The Female Brain.
Morgan Road Books.
Phillip Carter. 2013. Shared spaces, shared
structures: Latino social formation and African
American English in the U.S. south. Journal of
Sociolinguistics, 17:66–92.
Gabriel Doyle. 2014. Mapping dialectal varia-
tion by querying social media. In EACL, pages
98–106, Gothenburg, Sweden. Association for
Computational Linguistics.
Jacob Eisenstein, Noah A. Smith, and Eric Xing.
2011. Discovering sociolinguistic associations
with structured sparsity. In ACL.
Jacob Eisenstein. 2013. Phonological factors in
social media writing. In NAACL Workshop on
Language Analysis in Social Media, pages 11–
19, Atlanta, Georgia. Association for Computa-
tional Linguistics.
Jacob Eisenstein. to appear. Systematic patterning
in phonologically-motivated orthographic vari-
ation. Journal of Sociolinguistics.
Mark Graham, Scott Hale, and Devin Gaffney.
2014. Where in the world are you? Geoloca-
tion and language identification on Twitter. The
Professional Geographer, 66(4).
Amac Herdagdelen and Marco Baroni. 2011.
Stereotypical gender actions can be extracted
from web text. Journal of the American So-
ciety for Information Science and Technology,
62:1741–1749.
Dirk Hovy and Anders Søgaard. 2015. Tag-
ging performance correlates with author age. In
ACL.
Dirk Hovy, Anders Johannsen, and Anders
Søgaard. 2015. User review-sites as a
source for large-scale sociolinguistic studies. In
WWW.
Anders Johannsen, Dirk Hovy, and Anders
Søgaard. 2015. Cross-lingual syntactic varia-
tion over age and gender. In CoNLL.
William Labov, Sharon Ash, and Charles Boberg.
2005. The Atlas of North American En-
glish Phonetics, Phonology and Sound Change.
Mouton de Gruyter, New York, NY.
William Labov. 1972a. Language in the Inner
City: Studies in the Black English Vernacular.
University of Pennsylvania Press.
William Labov. 1972b. Sociolinguistic Patterns.
University of Pennsylvania Press, Philadelphia,
PA.
William Labov. 1990. The intersection of sex and
social class in the course of linguistic change.
Language Variation and Change, 2:205–254, 7.
William Labov. 2006. Unendangered dialects,
endangered people. In Natalie Schilling-Estes,
editor, GURT’06.
Miriam Meyerhof. 2006. Introducing Sociolin-
guistics. Routledge.
R Naroll. 1961. Two solutions to Galton’s prob-
lem. Philosophy of Science, 28.
Slav Petrov, Dipanjan Das, and Ryan McDonald.
2011. A universal part-of-speech tagset. CoRR
abs/1104.2086.
K.E. Pollock, G. Bailey, M. Berni, D. Fletcher,
L. Hinton, I. Johnson, J. Roberts,
and R. Weaver. 1998. Phonologi-
cal features of african american english.
http://www.rehabmed.ualberta.ca/spa/phono-
logy/features.htm.
Delip Rao, David Yarowsky, Abhishek Shreevats,
and Manaswi Gupta. 2010. Classifying la-
tent user attributes in twitter. In Proceedings of
the 2nd International Workshop on Search and
Mining User-generated Contents, pages 37–44.
ACM.
Sonya Rastogi, Tallese D. Johnson, Elizabeth M.
Hoeffel, and Malcolm P. Drewery Jr. 2011. The
black population: 2010. Technical report, US
Census, September.
John Rickford. 1999. African American Vernac-
ular English: Features, Evolution, Educational
Implications. Blackwell, Malden, MA.
</reference>
<page confidence="0.987736">
17
</page>
<reference confidence="0.99991032">
John Rickford. 2010. Geographical diversity, res-
idential segregation, and the vitality of african
american vernacular english and its speakers.
Transforming Anthropology, 18(1):28–34.
Sean Roberts and James Winters. 2013. Linguis-
tic diversity and traffic accidents: lessons from
statistical studies of cultural traits. PLoS ONE,
8(8).
Eric Thomas. 2007. Phonological and phonetic
characteristics of african american vernacular
english. Language and Linguistic Compass,
1(5):450–475.
Svitlana Volkova, Theresa Wilson, and David
Yarowsky. 2013. Exploring demographic lan-
guage variations to improve multilingual senti-
ment analysis in social media. In EMNLP.
Svitlana Volkova, Yoram Bachrach, Michael Arm-
strong, and Vijay Sharma. 2015. Inferring la-
tent user properties from texts published in so-
cial media (demo). In AAAI.
Walt Wolfram. 2004. The grammar of urban
african american vernacular english. In Kor-
mann B. and E. Schneider, editors, Handbook
of Varieties of English, pages 111–132, Berlin.
Mouton de Gruyter.
</reference>
<page confidence="0.99929">
18
</page>
</variant>
</algorithm>
<algorithm name="ParsHed" version="110505">
<variant no="0" confidence="0.354083">
<title confidence="0.560695">Challenges of studying and processing dialects in social media</title>
<author confidence="0.570859">Anna Katrine Jørgensen</author>
<author confidence="0.570859">Dirk Hovy</author>
<author confidence="0.570859">Anders</author>
<affiliation confidence="0.8851035">University of Njalsgade</affiliation>
<address confidence="0.956358">DK-2300 Copenhagen</address>
<email confidence="0.992394">soegaard@hum.ku.dk</email>
<abstract confidence="0.994671">Dialect features typically do not make it into formal writing, but flourish in social media. This enables large-scale variational studies. We focus on three phonological features of African American Vernacular English and their manifestation as spelling variations on Twitter. We discuss to what extent our data can be used to falsify eight sociolinguistic hypotheses. To go beyond the spelling level, we require automatic analysis such as POS tagging, but social media language still challenges language technologies. We show how both newswire- and Twitter-adapted state-of-the-art POS taggers perform significantly worse on AAVE tweets, suggesting that large-scale dialect studies of language variation beyond the surface level are not feasible with out-of-the-box NLP tools.</abstract>
</variant>
</algorithm>
<algorithm name="ParsCit" version="110505">
<citationList>
<citation valid="false">
<booktitle>Diversity and Diachrony,</booktitle>
<pages>33--44</pages>
<editor>Sankoff, editor,</editor>
<publisher>John Benjamins Publishing Co.</publisher>
<location>Amsterdam and Philadelphia.</location>
<marker></marker>
<rawString>Sankoff, editor, Diversity and Diachronyc, pages 33–44, Amsterdam and Philadelphia. John Benjamins Publishing Co.</rawString>
</citation>
<citation valid="true">
<authors>
<author>David Bamman</author>
<author>Jacob Eisenstein</author>
<author>Tyler Schnoebelen</author>
</authors>
<title>Gender identity and lexical variation in social media.</title>
<date>2014</date>
<journal>Journal of Sociolinguistics,</journal>
<volume>18</volume>
<contexts>
<context position="11793" citStr="Bamman et al., 2014" startWordPosition="1823" endWordPosition="1826">ally, as evidenced by our geographic correlations in Table 2, but since our studies explicitly test the influence of location, it is not the case for most of the hypotheses considered here that geographic diffusion is the underlying explanation for something else. In §3, we discuss whether these four methodological problems compromise the validity of our findings. One other methodological problems that may be relevant for other studies of dialect in social media, is almost completely irrelevant for our study: It is often important to control for topic in dialectal and sociolinguistic studies (Bamman et al., 2014), e.g., when studying the lexical preferences of speakers of urban ethnolects. We call this problem (5) TOPIC BIAS. Using word pairs with equivalent meanings for our studies, we implicitly control for topic (but see §3.1). Feature Positive Negative Total count brotha brother 9528 foreva forever 3673 hea here 4352 lova lover 1273 motha mother 4668 /r/ /Ø/ or /@/ ova over 3441 sista sister 5325 wateva whatever 2974 wea where 5153 total 40,387 kreet street 1226 /str/ /skr/ :krong strong 1629 skrip strip 1101 total 3956 brova brother 3715 dat that 2610 deez these 4477 /D/ /d/or/v/ dem them 3645 de</context>
</contexts>
<marker>Bamman, Eisenstein, Schnoebelen, 2014</marker>
<rawString>David Bamman, Jacob Eisenstein, and Tyler Schnoebelen. 2014. Gender identity and lexical variation in social media. Journal of Sociolinguistics, 18.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Louann Brizendine</author>
</authors>
<title>The Female Brain.</title>
<date>2006</date>
<publisher>Morgan Road Books.</publisher>
<contexts>
<context position="23249" citStr="Brizendine, 2006" startWordPosition="3835" endWordPosition="3836">trong skrip/strip total *** *** *** *** *** *** *** *** ** *** *** *** – * *** *** *** *** – ** – *** sista/sister – *** /str/ – brova/brother *** *** *** *** dat/that *** * – *** deez/these****– – /D/ dem/them *** *** – *** dey/they *** *** – *** dis/this *** – – *** mova/mother * *** *** *** mouf/mouth *** – – *** nuffin/nothing *** *** *** *** souf/south *** *** *** *** teef/teeth ** – ** *** trough/through – *** total * *** *** *** /T/ – – foreva/forever hea/here *** *** ** *** *** 15 established that women tend to be more linguistically creative and have larger vocabularies (Labov, 1990; Brizendine, 2006). Whether women are also more meta-linguistic (METAUSE BIAS), has to the best of our knowledge not been studied. Since genders are almost equally geographically distributed, and since Twitter is generally considered genderbalanced, neither USER POPULATION BIAS nor GALTON’S PROBLEM is likely to bias our conclusions. TOPIC BIAS, on the other hand, may. While our semantically equivalent pairs control for topic, the pragmatics sometimes differ. Just like code-switching is a strategy for bilinguals, using the spelling motha instead of mother could mean something, say irony, which one gender is more</context>
</contexts>
<marker>Brizendine, 2006</marker>
<rawString>Louann Brizendine. 2006. The Female Brain. Morgan Road Books.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Phillip Carter</author>
</authors>
<title>Shared spaces, shared structures: Latino social formation and African</title>
<date>2013</date>
<journal>American English in the U.S. south. Journal of Sociolinguistics,</journal>
<pages>17--66</pages>
<contexts>
<context position="8844" citStr="Carter, 2013" startWordPosition="1364" endWordPosition="1365">use it in citations, for example: (1) My 5 year old sister texted me on my mums phone saying “why did you take a picher in da bafroom” lool okay b (Twitter, Feb 21 2015) or in meta-linguistic discussions: (2) Whenever I hear a black person inquire about the location of the ”bafroom”... (Twitter, Jan 20 2015) We refer to these phenomena as (2) METAUSE BIAS. This bias is important with rare phenomena. With ”bafroom”, it seems that about 1 in 20 occurrences on Twitter are metauses. Meta-uses may also serve social functions. AAVE features are used as cultural markers by Latinos in North Carolina (Carter, 2013), for example. Some of the research hypotheses considered (113 and 115) relate to demographic variables such as income and educational levels. While we do not have socio-economic information about the individual Twitter user, we can use the geo-located tweets to study the correlation between socio-economic variables and linguistic features at the level of cities or ZIP codes.1 Eisenstein et al. (2011) note that this level of abstraction introduces some noise. Since Twitter users do not form representative samples of the population, the mean income for a city or ZIP code is not necessarily the </context>
</contexts>
<marker>Carter, 2013</marker>
<rawString>Phillip Carter. 2013. Shared spaces, shared structures: Latino social formation and African American English in the U.S. south. Journal of Sociolinguistics, 17:66–92.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Gabriel Doyle</author>
</authors>
<title>Mapping dialectal variation by querying social media. In</title>
<date>2014</date>
<booktitle>EACL,</booktitle>
<pages>98--106</pages>
<institution>Gothenburg, Sweden. Association for Computational Linguistics.</institution>
<contexts>
<context position="1890" citStr="Doyle, 2014" startWordPosition="288" endWordPosition="289">rs. Dallas is represented by four subjects, the New York City dialect by six, etc. Data is costly to collect, and, as a consequence, scarce. Written language was traditionally used for formal purposes, and therefore differed in style from colloquial, spoken language. However, with the rise of social media platforms and the vast production of user generated content, differences between written and spoken language diminish. A number of recent papers have explored social media with respect to sociolinguistic and dialectological questions (Rao et al., 2010; Eisenstein, 2013; Volkova et al., 2013; Doyle, 2014; Hovy et al., 2015; Volkova et al., 2015; Johannsen et al., 2015; Hovy and Søgaard, 2015; Eisenstein, to appear). Emails, chats and social media posts serve purposes similar to those of spoken language, and consequently, features of spoken language, such as interjections, ellipses, and phonological variation, have found their way into this type of written language. Our work differs from most previous approaches by investigating several phonological spelling correlates of a specific language variety. The 284 million active users on Twitter post more than half a billion tweets every day, and so</context>
<context position="7081" citStr="Doyle (2014)" startWordPosition="1078" endWordPosition="1079">ely correlated with income and educational level (Rickford, 1999). H6: Interdental fricative mutation is more frequent in AAVE than in European American speech (Pollock et al., 1998; Thomas, 2007). H7: Interdental fricative mutation is predominantly found in the Gulf states (Rastogi et al., 2011). H8: Backing in /str/ (to /skr/) is unique to AAVE (Rickford, 1999; Thomas, 2007; Labov, 2006). Hypotheses 1–8 are investigated by correlating the distribution of phonological variants in geo-located tweets with demographic information. Our method is similar to those proposed by Eisenstein (2013) and Doyle (2014), lending statistical power to sociolinguistic analyses, and circumventing traditional issues with data collection such as the Observer’s Paradox (Labov, 1972b; Meyerhof, 2006). Our work differs from previous work by studying phonological rules associated with specific dialects, as well as considering a wide range of actual sociolinguistic research hypotheses, but our main focus is the methodological problems doing this kind of work, as well as assessing the limitations of such work. 1.3 Methodological problems One obvious challenge relating social media data to sociolinguistic studies is that</context>
<context position="14094" citStr="Doyle (2014)" startWordPosition="2221" endWordPosition="2222">n /str/ has been reported to be a unique feature in AAVE, as it is unheard in other North American dialects (Rickford, 1999; Labov, 1972a; Thomas, 2007). The two interdental fricative mutations relate to substitutions of /d/ and /0/ by /d/, /v/ and /t/, /f/ in words such as that and mother or nothing and with. It has been reported that mutations of /d/ and /0/ are more common among African Americans than among European Americans and that the frequency of the mutations is inversely correlated with socio-economic levels and formality of speaking (Rickford, 1999). We follow Eisenstein (2013) and Doyle (2014) in assuming that spelling variation may be a result of phonological differences and select 25 word pairs for our study (Tabel 1). For each word pair, we collect positive (e.g., ”skreet”) and negative occurrences (e.g., ”street”), resulting in a total number of 79,396 tweets. The word pairs were chosen based on the unambiguity, frequency and representability of the phonological variations. Uniquely, backing in /str/ is represented by three word pairs of high similarity, which is due to phonological restrictions on the variation of /str/ to /skr/ and to the fact that backing in /str/ is a very </context>
</contexts>
<marker>Doyle, 2014</marker>
<rawString>Gabriel Doyle. 2014. Mapping dialectal variation by querying social media. In EACL, pages 98–106, Gothenburg, Sweden. Association for Computational Linguistics.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Jacob Eisenstein</author>
<author>Noah A Smith</author>
<author>Eric Xing</author>
</authors>
<title>Discovering sociolinguistic associations with structured sparsity.</title>
<date>2011</date>
<booktitle>In ACL.</booktitle>
<contexts>
<context position="9248" citStr="Eisenstein et al. (2011)" startWordPosition="1426" endWordPosition="1429">henomena. With ”bafroom”, it seems that about 1 in 20 occurrences on Twitter are metauses. Meta-uses may also serve social functions. AAVE features are used as cultural markers by Latinos in North Carolina (Carter, 2013), for example. Some of the research hypotheses considered (113 and 115) relate to demographic variables such as income and educational levels. While we do not have socio-economic information about the individual Twitter user, we can use the geo-located tweets to study the correlation between socio-economic variables and linguistic features at the level of cities or ZIP codes.1 Eisenstein et al. (2011) note that this level of abstraction introduces some noise. Since Twitter users do not form representative samples of the population, the mean income for a city or ZIP code is not necessarily the mean income for the Twitter users in that area. We refer to this problem as the (3) USER POPULATION BIAS. Another serious methodological problem known as (4) GALTON’S PROBLEM (Naroll, 1961; Roberts and Winters, 2013), is the observation that cross-cultural associations are 1Unlike many others, we rely on physical locations rather than user-entered profile locations. See Graham et al. (2014) for discus</context>
</contexts>
<marker>Eisenstein, Smith, Xing, 2011</marker>
<rawString>Jacob Eisenstein, Noah A. Smith, and Eric Xing. 2011. Discovering sociolinguistic associations with structured sparsity. In ACL.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Jacob Eisenstein</author>
</authors>
<title>Phonological factors in social media writing.</title>
<date>2013</date>
<booktitle>In NAACL Workshop on Language Analysis in Social Media,</booktitle>
<pages>11--19</pages>
<location>Atlanta,</location>
<contexts>
<context position="1855" citStr="Eisenstein, 2013" startWordPosition="282" endWordPosition="283">pletion, but is based on only 762 speakers. Dallas is represented by four subjects, the New York City dialect by six, etc. Data is costly to collect, and, as a consequence, scarce. Written language was traditionally used for formal purposes, and therefore differed in style from colloquial, spoken language. However, with the rise of social media platforms and the vast production of user generated content, differences between written and spoken language diminish. A number of recent papers have explored social media with respect to sociolinguistic and dialectological questions (Rao et al., 2010; Eisenstein, 2013; Volkova et al., 2013; Doyle, 2014; Hovy et al., 2015; Volkova et al., 2015; Johannsen et al., 2015; Hovy and Søgaard, 2015; Eisenstein, to appear). Emails, chats and social media posts serve purposes similar to those of spoken language, and consequently, features of spoken language, such as interjections, ellipses, and phonological variation, have found their way into this type of written language. Our work differs from most previous approaches by investigating several phonological spelling correlates of a specific language variety. The 284 million active users on Twitter post more than half</context>
<context position="7064" citStr="Eisenstein (2013)" startWordPosition="1075" endWordPosition="1076">otacization is negatively correlated with income and educational level (Rickford, 1999). H6: Interdental fricative mutation is more frequent in AAVE than in European American speech (Pollock et al., 1998; Thomas, 2007). H7: Interdental fricative mutation is predominantly found in the Gulf states (Rastogi et al., 2011). H8: Backing in /str/ (to /skr/) is unique to AAVE (Rickford, 1999; Thomas, 2007; Labov, 2006). Hypotheses 1–8 are investigated by correlating the distribution of phonological variants in geo-located tweets with demographic information. Our method is similar to those proposed by Eisenstein (2013) and Doyle (2014), lending statistical power to sociolinguistic analyses, and circumventing traditional issues with data collection such as the Observer’s Paradox (Labov, 1972b; Meyerhof, 2006). Our work differs from previous work by studying phonological rules associated with specific dialects, as well as considering a wide range of actual sociolinguistic research hypotheses, but our main focus is the methodological problems doing this kind of work, as well as assessing the limitations of such work. 1.3 Methodological problems One obvious challenge relating social media data to sociolinguisti</context>
<context position="14077" citStr="Eisenstein (2013)" startWordPosition="2218" endWordPosition="2219">/ for strip. Backing in /str/ has been reported to be a unique feature in AAVE, as it is unheard in other North American dialects (Rickford, 1999; Labov, 1972a; Thomas, 2007). The two interdental fricative mutations relate to substitutions of /d/ and /0/ by /d/, /v/ and /t/, /f/ in words such as that and mother or nothing and with. It has been reported that mutations of /d/ and /0/ are more common among African Americans than among European Americans and that the frequency of the mutations is inversely correlated with socio-economic levels and formality of speaking (Rickford, 1999). We follow Eisenstein (2013) and Doyle (2014) in assuming that spelling variation may be a result of phonological differences and select 25 word pairs for our study (Tabel 1). For each word pair, we collect positive (e.g., ”skreet”) and negative occurrences (e.g., ”street”), resulting in a total number of 79,396 tweets. The word pairs were chosen based on the unambiguity, frequency and representability of the phonological variations. Uniquely, backing in /str/ is represented by three word pairs of high similarity, which is due to phonological restrictions on the variation of /str/ to /skr/ and to the fact that backing in</context>
</contexts>
<marker>Eisenstein, 2013</marker>
<rawString>Jacob Eisenstein. 2013. Phonological factors in social media writing. In NAACL Workshop on Language Analysis in Social Media, pages 11– 19, Atlanta, Georgia. Association for Computational Linguistics.</rawString>
</citation>
<citation valid="false">
<authors>
<author>Jacob Eisenstein</author>
</authors>
<title>to appear. Systematic patterning in phonologically-motivated orthographic variation.</title>
<journal>Journal of Sociolinguistics.</journal>
<marker>Eisenstein, </marker>
<rawString>Jacob Eisenstein. to appear. Systematic patterning in phonologically-motivated orthographic variation. Journal of Sociolinguistics.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Mark Graham</author>
<author>Scott Hale</author>
<author>Devin Gaffney</author>
</authors>
<title>Where in the world are you? Geolocation and language identification on Twitter.</title>
<date>2014</date>
<booktitle>The Professional Geographer,</booktitle>
<volume>66</volume>
<issue>4</issue>
<contexts>
<context position="9837" citStr="Graham et al. (2014)" startWordPosition="1522" endWordPosition="1525">odes.1 Eisenstein et al. (2011) note that this level of abstraction introduces some noise. Since Twitter users do not form representative samples of the population, the mean income for a city or ZIP code is not necessarily the mean income for the Twitter users in that area. We refer to this problem as the (3) USER POPULATION BIAS. Another serious methodological problem known as (4) GALTON’S PROBLEM (Naroll, 1961; Roberts and Winters, 2013), is the observation that cross-cultural associations are 1Unlike many others, we rely on physical locations rather than user-entered profile locations. See Graham et al. (2014) for discussion. 11 often explained by geographical diffusion. In other words, it is the problem of discriminating historical from functional associations in cross-cultural surveys. Briefly put, when we sample tweets and income-levels from US cities, there is little independence between the city data points. Linguistic features diffuse geographically and do not change at random, and we can therefore expect to see more spurious correlations than usual. Like with the famous example of chocolate and Nobel Prize winners, our positive findings may be explained by hidden background variables. A posi</context>
</contexts>
<marker>Graham, Hale, Gaffney, 2014</marker>
<rawString>Mark Graham, Scott Hale, and Devin Gaffney. 2014. Where in the world are you? Geolocation and language identification on Twitter. The Professional Geographer, 66(4).</rawString>
</citation>
<citation valid="true">
<authors>
<author>Amac Herdagdelen</author>
<author>Marco Baroni</author>
</authors>
<title>Stereotypical gender actions can be extracted from web text.</title>
<date>2011</date>
<journal>Journal of the American Society for Information Science and Technology,</journal>
<pages>62--1741</pages>
<contexts>
<context position="19219" citStr="Herdagdelen and Baroni, 2011" startWordPosition="3133" endWordPosition="3136">to H4. More interestingly, our data suggests that women use AAVE features more often than men, i.e., there is a negative correlation between male gender and AAVE features, contrary to the second half of H3, namely that AAVE is more frequently appropriated by men. Note, however, that our gender ratios are aggregated for city areas, and with the demographic bias of Twitter, these correlations should be taken with a grain of salt. Considering the small gender ratio differences, we also compute correlations between our linguistic features and gender using the Rovereto Twitter N-gram Corpus (RTC) (Herdagdelen and Baroni, 2011).4 The RTC corpus contains information about the gender of the tweeter associated with n-grams. While there is too little data in the corpus to correlate gender and backing in /str/, derhotacization and both interdental fricative mutations (/D/ → /d/ or /v/ and /T/ → /t/ or /f/) correlate significantly with women. Out of our words, 10 correlate sig4http://clic.cimec.unitn.it/amac/ twitter_ngram/ 14 Feature word pairs latitude longitude urban Gulf – = p &gt; 0.05, * = 0.05 &gt; p &gt; 0.01, ** = p &lt; 0.01, *** = p &lt; 0.0001 Shading corresponds to negative correlations Table 2: Geographic correlations nifi</context>
</contexts>
<marker>Herdagdelen, Baroni, 2011</marker>
<rawString>Amac Herdagdelen and Marco Baroni. 2011. Stereotypical gender actions can be extracted from web text. Journal of the American Society for Information Science and Technology, 62:1741–1749.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Dirk Hovy</author>
<author>Anders Søgaard</author>
</authors>
<title>Tagging performance correlates with author age.</title>
<date>2015</date>
<booktitle>In ACL.</booktitle>
<contexts>
<context position="1979" citStr="Hovy and Søgaard, 2015" startWordPosition="302" endWordPosition="305">etc. Data is costly to collect, and, as a consequence, scarce. Written language was traditionally used for formal purposes, and therefore differed in style from colloquial, spoken language. However, with the rise of social media platforms and the vast production of user generated content, differences between written and spoken language diminish. A number of recent papers have explored social media with respect to sociolinguistic and dialectological questions (Rao et al., 2010; Eisenstein, 2013; Volkova et al., 2013; Doyle, 2014; Hovy et al., 2015; Volkova et al., 2015; Johannsen et al., 2015; Hovy and Søgaard, 2015; Eisenstein, to appear). Emails, chats and social media posts serve purposes similar to those of spoken language, and consequently, features of spoken language, such as interjections, ellipses, and phonological variation, have found their way into this type of written language. Our work differs from most previous approaches by investigating several phonological spelling correlates of a specific language variety. The 284 million active users on Twitter post more than half a billion tweets every day, and some fraction of these tweets are geo-located. Eisenstein (2013) and Doyle (2014) studied t</context>
</contexts>
<marker>Hovy, Søgaard, 2015</marker>
<rawString>Dirk Hovy and Anders Søgaard. 2015. Tagging performance correlates with author age. In ACL.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Dirk Hovy</author>
<author>Anders Johannsen</author>
<author>Anders Søgaard</author>
</authors>
<title>User review-sites as a source for large-scale sociolinguistic studies.</title>
<date>2015</date>
<booktitle>In WWW.</booktitle>
<contexts>
<context position="1909" citStr="Hovy et al., 2015" startWordPosition="290" endWordPosition="293"> represented by four subjects, the New York City dialect by six, etc. Data is costly to collect, and, as a consequence, scarce. Written language was traditionally used for formal purposes, and therefore differed in style from colloquial, spoken language. However, with the rise of social media platforms and the vast production of user generated content, differences between written and spoken language diminish. A number of recent papers have explored social media with respect to sociolinguistic and dialectological questions (Rao et al., 2010; Eisenstein, 2013; Volkova et al., 2013; Doyle, 2014; Hovy et al., 2015; Volkova et al., 2015; Johannsen et al., 2015; Hovy and Søgaard, 2015; Eisenstein, to appear). Emails, chats and social media posts serve purposes similar to those of spoken language, and consequently, features of spoken language, such as interjections, ellipses, and phonological variation, have found their way into this type of written language. Our work differs from most previous approaches by investigating several phonological spelling correlates of a specific language variety. The 284 million active users on Twitter post more than half a billion tweets every day, and some fraction of thes</context>
</contexts>
<marker>Hovy, Johannsen, Søgaard, 2015</marker>
<rawString>Dirk Hovy, Anders Johannsen, and Anders Søgaard. 2015. User review-sites as a source for large-scale sociolinguistic studies. In WWW.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Anders Johannsen</author>
<author>Dirk Hovy</author>
<author>Anders Søgaard</author>
</authors>
<title>Cross-lingual syntactic variation over age and gender.</title>
<date>2015</date>
<booktitle>In CoNLL.</booktitle>
<contexts>
<context position="1955" citStr="Johannsen et al., 2015" startWordPosition="298" endWordPosition="301">rk City dialect by six, etc. Data is costly to collect, and, as a consequence, scarce. Written language was traditionally used for formal purposes, and therefore differed in style from colloquial, spoken language. However, with the rise of social media platforms and the vast production of user generated content, differences between written and spoken language diminish. A number of recent papers have explored social media with respect to sociolinguistic and dialectological questions (Rao et al., 2010; Eisenstein, 2013; Volkova et al., 2013; Doyle, 2014; Hovy et al., 2015; Volkova et al., 2015; Johannsen et al., 2015; Hovy and Søgaard, 2015; Eisenstein, to appear). Emails, chats and social media posts serve purposes similar to those of spoken language, and consequently, features of spoken language, such as interjections, ellipses, and phonological variation, have found their way into this type of written language. Our work differs from most previous approaches by investigating several phonological spelling correlates of a specific language variety. The 284 million active users on Twitter post more than half a billion tweets every day, and some fraction of these tweets are geo-located. Eisenstein (2013) an</context>
</contexts>
<marker>Johannsen, Hovy, Søgaard, 2015</marker>
<rawString>Anders Johannsen, Dirk Hovy, and Anders Søgaard. 2015. Cross-lingual syntactic variation over age and gender. In CoNLL.</rawString>
</citation>
<citation valid="true">
<authors>
<author>William Labov</author>
<author>Sharon Ash</author>
<author>Charles Boberg</author>
</authors>
<date>2005</date>
<journal>The Atlas of North American English Phonetics, Phonology and Sound Change. Mouton de Gruyter,</journal>
<location>New York, NY.</location>
<contexts>
<context position="1170" citStr="Labov et al., 2005" startWordPosition="173" endWordPosition="176">inguistic hypotheses. To go beyond the spelling level, we require automatic analysis such as POS tagging, but social media language still challenges language technologies. We show how both newswire- and Twitter-adapted stateof-the-art POS taggers perform significantly worse on AAVE tweets, suggesting that large-scale dialect studies of language variation beyond the surface level are not feasible with out-ofthe-box NLP tools. 1 Introduction Dialectal and sociolinguistic studies are traditionally based on interviews of small sets of speakers of each variety. The Atlas of North American English (Labov et al., 2005) has been the reference point for American dialectology since its completion, but is based on only 762 speakers. Dallas is represented by four subjects, the New York City dialect by six, etc. Data is costly to collect, and, as a consequence, scarce. Written language was traditionally used for formal purposes, and therefore differed in style from colloquial, spoken language. However, with the rise of social media platforms and the vast production of user generated content, differences between written and spoken language diminish. A number of recent papers have explored social media with respect</context>
<context position="5808" citStr="Labov et al., 2005" startWordPosition="882" endWordPosition="885">results. • Further, we show that state-of-the-art newswire and Twitter POS taggers perform much worse on tweets containing AAVE features. This suggests an additional limitation to large-scale sociolinguistic research using social media data, namely that it is hard to analyze variation beyond the lexical level with current tools. 1.2 Sociolinguistic hypotheses AAVE is, in contrast to other North American dialects, not geographically restricted. Although variation in AAVE does exist, AAVE in urban settings has been established as a uniform system with suprasegmental norms (Ash and Myhill, 1986; Labov et al., 2005; Labov, 2006; Wolfram, 2004). This paper considers the following eight (8) hypotheses from the sociolinguistic literature about AAVE as a ethnolect: H1: AAVE is an urban ethnolect (Rickford, 1999; Wolfram, 2004). H2: AAVE features are more present in the Gulf states than in the rest of the United States (Rastogi et al., 2011). 10 H3: The likelihood of speaking AAVE correlates negatively with income and educational level, and AAVE is more frequently appropriated by men (Rickford, 1999; Rickford, 2010). H4: Derhotacization is more frequent in African Americans than in European Americans (Labov </context>
</contexts>
<marker>Labov, Ash, Boberg, 2005</marker>
<rawString>William Labov, Sharon Ash, and Charles Boberg. 2005. The Atlas of North American English Phonetics, Phonology and Sound Change. Mouton de Gruyter, New York, NY.</rawString>
</citation>
<citation valid="true">
<authors>
<author>William Labov</author>
</authors>
<title>Language in the Inner City: Studies in the Black English Vernacular.</title>
<date>1972</date>
<publisher>University of Pennsylvania Press.</publisher>
<contexts>
<context position="7239" citStr="Labov, 1972" startWordPosition="1101" endWordPosition="1102"> (Pollock et al., 1998; Thomas, 2007). H7: Interdental fricative mutation is predominantly found in the Gulf states (Rastogi et al., 2011). H8: Backing in /str/ (to /skr/) is unique to AAVE (Rickford, 1999; Thomas, 2007; Labov, 2006). Hypotheses 1–8 are investigated by correlating the distribution of phonological variants in geo-located tweets with demographic information. Our method is similar to those proposed by Eisenstein (2013) and Doyle (2014), lending statistical power to sociolinguistic analyses, and circumventing traditional issues with data collection such as the Observer’s Paradox (Labov, 1972b; Meyerhof, 2006). Our work differs from previous work by studying phonological rules associated with specific dialects, as well as considering a wide range of actual sociolinguistic research hypotheses, but our main focus is the methodological problems doing this kind of work, as well as assessing the limitations of such work. 1.3 Methodological problems One obvious challenge relating social media data to sociolinguistic studies is that there is generally not a one-to-one relationship between phonological variation and spelling variation. People, in other words, do not spell the way they pro</context>
<context position="13618" citStr="Labov, 1972" startWordPosition="2141" endWordPosition="2142">s, /r/ is either not pronounced or is approximated as a vocalization in the surface form, when /r/ is in a pre-vocalic position. This can result in an elongation of the preceding vowel or in an off-glide schwa /@/, e.g., guard → /gA:d/, car → /ka:/, fear → /fi@/ (Thomas, 2007). Backing in /skr/ denotes the substitution 12 of /str/ for /skr/ in word-initial positions resulting in pronunciations such as /skrit/ for street, /skraq/ for strong and /skrT/ for strip. Backing in /str/ has been reported to be a unique feature in AAVE, as it is unheard in other North American dialects (Rickford, 1999; Labov, 1972a; Thomas, 2007). The two interdental fricative mutations relate to substitutions of /d/ and /0/ by /d/, /v/ and /t/, /f/ in words such as that and mother or nothing and with. It has been reported that mutations of /d/ and /0/ are more common among African Americans than among European Americans and that the frequency of the mutations is inversely correlated with socio-economic levels and formality of speaking (Rickford, 1999). We follow Eisenstein (2013) and Doyle (2014) in assuming that spelling variation may be a result of phonological differences and select 25 word pairs for our study (Tab</context>
</contexts>
<marker>Labov, 1972</marker>
<rawString>William Labov. 1972a. Language in the Inner City: Studies in the Black English Vernacular. University of Pennsylvania Press.</rawString>
</citation>
<citation valid="true">
<authors>
<author>William Labov</author>
</authors>
<title>Sociolinguistic Patterns.</title>
<date>1972</date>
<publisher>University of Pennsylvania Press,</publisher>
<location>Philadelphia, PA.</location>
<contexts>
<context position="7239" citStr="Labov, 1972" startWordPosition="1101" endWordPosition="1102"> (Pollock et al., 1998; Thomas, 2007). H7: Interdental fricative mutation is predominantly found in the Gulf states (Rastogi et al., 2011). H8: Backing in /str/ (to /skr/) is unique to AAVE (Rickford, 1999; Thomas, 2007; Labov, 2006). Hypotheses 1–8 are investigated by correlating the distribution of phonological variants in geo-located tweets with demographic information. Our method is similar to those proposed by Eisenstein (2013) and Doyle (2014), lending statistical power to sociolinguistic analyses, and circumventing traditional issues with data collection such as the Observer’s Paradox (Labov, 1972b; Meyerhof, 2006). Our work differs from previous work by studying phonological rules associated with specific dialects, as well as considering a wide range of actual sociolinguistic research hypotheses, but our main focus is the methodological problems doing this kind of work, as well as assessing the limitations of such work. 1.3 Methodological problems One obvious challenge relating social media data to sociolinguistic studies is that there is generally not a one-to-one relationship between phonological variation and spelling variation. People, in other words, do not spell the way they pro</context>
<context position="13618" citStr="Labov, 1972" startWordPosition="2141" endWordPosition="2142">s, /r/ is either not pronounced or is approximated as a vocalization in the surface form, when /r/ is in a pre-vocalic position. This can result in an elongation of the preceding vowel or in an off-glide schwa /@/, e.g., guard → /gA:d/, car → /ka:/, fear → /fi@/ (Thomas, 2007). Backing in /skr/ denotes the substitution 12 of /str/ for /skr/ in word-initial positions resulting in pronunciations such as /skrit/ for street, /skraq/ for strong and /skrT/ for strip. Backing in /str/ has been reported to be a unique feature in AAVE, as it is unheard in other North American dialects (Rickford, 1999; Labov, 1972a; Thomas, 2007). The two interdental fricative mutations relate to substitutions of /d/ and /0/ by /d/, /v/ and /t/, /f/ in words such as that and mother or nothing and with. It has been reported that mutations of /d/ and /0/ are more common among African Americans than among European Americans and that the frequency of the mutations is inversely correlated with socio-economic levels and formality of speaking (Rickford, 1999). We follow Eisenstein (2013) and Doyle (2014) in assuming that spelling variation may be a result of phonological differences and select 25 word pairs for our study (Tab</context>
</contexts>
<marker>Labov, 1972</marker>
<rawString>William Labov. 1972b. Sociolingustic Patterns. University of Pennsylvania Press, Philadelphia, PA.</rawString>
</citation>
<citation valid="true">
<authors>
<author>William Labov</author>
</authors>
<title>The intersection of sex and social class in the course of linguistic change.</title>
<date>1990</date>
<journal>Language Variation and Change,</journal>
<volume>2</volume>
<pages>205--254</pages>
<contexts>
<context position="23230" citStr="Labov, 1990" startWordPosition="3833" endWordPosition="3834">reet skrong/strong skrip/strip total *** *** *** *** *** *** *** *** ** *** *** *** – * *** *** *** *** – ** – *** sista/sister – *** /str/ – brova/brother *** *** *** *** dat/that *** * – *** deez/these****– – /D/ dem/them *** *** – *** dey/they *** *** – *** dis/this *** – – *** mova/mother * *** *** *** mouf/mouth *** – – *** nuffin/nothing *** *** *** *** souf/south *** *** *** *** teef/teeth ** – ** *** trough/through – *** total * *** *** *** /T/ – – foreva/forever hea/here *** *** ** *** *** 15 established that women tend to be more linguistically creative and have larger vocabularies (Labov, 1990; Brizendine, 2006). Whether women are also more meta-linguistic (METAUSE BIAS), has to the best of our knowledge not been studied. Since genders are almost equally geographically distributed, and since Twitter is generally considered genderbalanced, neither USER POPULATION BIAS nor GALTON’S PROBLEM is likely to bias our conclusions. TOPIC BIAS, on the other hand, may. While our semantically equivalent pairs control for topic, the pragmatics sometimes differ. Just like code-switching is a strategy for bilinguals, using the spelling motha instead of mother could mean something, say irony, which</context>
</contexts>
<marker>Labov, 1990</marker>
<rawString>William Labov. 1990. The intersection of sex and social class in the course of linguistic change. Language Variation and Change, 2:205–254, 7.</rawString>
</citation>
<citation valid="true">
<authors>
<author>William Labov</author>
</authors>
<title>Unendangered dialects, endangered people.</title>
<date>2006</date>
<booktitle>GURT’06.</booktitle>
<editor>In Natalie Schilling-Estes, editor,</editor>
<contexts>
<context position="5821" citStr="Labov, 2006" startWordPosition="886" endWordPosition="887">we show that state-of-the-art newswire and Twitter POS taggers perform much worse on tweets containing AAVE features. This suggests an additional limitation to large-scale sociolinguistic research using social media data, namely that it is hard to analyze variation beyond the lexical level with current tools. 1.2 Sociolinguistic hypotheses AAVE is, in contrast to other North American dialects, not geographically restricted. Although variation in AAVE does exist, AAVE in urban settings has been established as a uniform system with suprasegmental norms (Ash and Myhill, 1986; Labov et al., 2005; Labov, 2006; Wolfram, 2004). This paper considers the following eight (8) hypotheses from the sociolinguistic literature about AAVE as a ethnolect: H1: AAVE is an urban ethnolect (Rickford, 1999; Wolfram, 2004). H2: AAVE features are more present in the Gulf states than in the rest of the United States (Rastogi et al., 2011). 10 H3: The likelihood of speaking AAVE correlates negatively with income and educational level, and AAVE is more frequently appropriated by men (Rickford, 1999; Rickford, 2010). H4: Derhotacization is more frequent in African Americans than in European Americans (Labov et al., 2005;</context>
</contexts>
<marker>Labov, 2006</marker>
<rawString>William Labov. 2006. Unendangered dialects, endangered people. In Natalie Schilling-Estes, editor, GURT’06.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Miriam Meyerhof</author>
</authors>
<title>Introducing Sociolinguistics.</title>
<date>2006</date>
<publisher>Routledge.</publisher>
<contexts>
<context position="7257" citStr="Meyerhof, 2006" startWordPosition="1103" endWordPosition="1104">l., 1998; Thomas, 2007). H7: Interdental fricative mutation is predominantly found in the Gulf states (Rastogi et al., 2011). H8: Backing in /str/ (to /skr/) is unique to AAVE (Rickford, 1999; Thomas, 2007; Labov, 2006). Hypotheses 1–8 are investigated by correlating the distribution of phonological variants in geo-located tweets with demographic information. Our method is similar to those proposed by Eisenstein (2013) and Doyle (2014), lending statistical power to sociolinguistic analyses, and circumventing traditional issues with data collection such as the Observer’s Paradox (Labov, 1972b; Meyerhof, 2006). Our work differs from previous work by studying phonological rules associated with specific dialects, as well as considering a wide range of actual sociolinguistic research hypotheses, but our main focus is the methodological problems doing this kind of work, as well as assessing the limitations of such work. 1.3 Methodological problems One obvious challenge relating social media data to sociolinguistic studies is that there is generally not a one-to-one relationship between phonological variation and spelling variation. People, in other words, do not spell the way they pronounce. Eisenstein</context>
</contexts>
<marker>Meyerhof, 2006</marker>
<rawString>Miriam Meyerhof. 2006. Introducing Sociolinguistics. Routledge.</rawString>
</citation>
<citation valid="true">
<authors>
<author>R Naroll</author>
</authors>
<title>Two solutions to Galton’s problem.</title>
<date>1961</date>
<journal>Philosophy of Science,</journal>
<volume>28</volume>
<contexts>
<context position="9632" citStr="Naroll, 1961" startWordPosition="1494" endWordPosition="1495">ic information about the individual Twitter user, we can use the geo-located tweets to study the correlation between socio-economic variables and linguistic features at the level of cities or ZIP codes.1 Eisenstein et al. (2011) note that this level of abstraction introduces some noise. Since Twitter users do not form representative samples of the population, the mean income for a city or ZIP code is not necessarily the mean income for the Twitter users in that area. We refer to this problem as the (3) USER POPULATION BIAS. Another serious methodological problem known as (4) GALTON’S PROBLEM (Naroll, 1961; Roberts and Winters, 2013), is the observation that cross-cultural associations are 1Unlike many others, we rely on physical locations rather than user-entered profile locations. See Graham et al. (2014) for discussion. 11 often explained by geographical diffusion. In other words, it is the problem of discriminating historical from functional associations in cross-cultural surveys. Briefly put, when we sample tweets and income-levels from US cities, there is little independence between the city data points. Linguistic features diffuse geographically and do not change at random, and we can th</context>
<context position="11100" citStr="Naroll, 1961" startWordPosition="1716" endWordPosition="1717">ogical pattern may also have cultural, religious or geographical explanations. Reasons to be less worried about GALTON’S PROBLEM in our case, include that a) we only consider standard hypotheses from the sociolinguistics literature and not a huge set of previously unexplored, automatically generated hypotheses, b) we sample data points at random from all across the US, giving us a very sparse distribution compared to country-level data, but more notably, c) location is an important, explicit variable in our study. GALTON’S PROBLEM is typically identified by clustering tests based on location (Naroll, 1961). Obviously, the phonological features considered here cluster geographically, as evidenced by our geographic correlations in Table 2, but since our studies explicitly test the influence of location, it is not the case for most of the hypotheses considered here that geographic diffusion is the underlying explanation for something else. In §3, we discuss whether these four methodological problems compromise the validity of our findings. One other methodological problems that may be relevant for other studies of dialect in social media, is almost completely irrelevant for our study: It is often </context>
</contexts>
<marker>Naroll, 1961</marker>
<rawString>R Naroll. 1961. Two solutions to Galton’s problem. Philosophy of Science, 28.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Slav Petrov</author>
<author>Dipanjan Das</author>
<author>Ryan McDonald</author>
</authors>
<title>A universal part-of-speech tagset.</title>
<date>2011</date>
<note>CoRR abs/1104.2086.</note>
<contexts>
<context position="24456" citStr="Petrov et al. (2011)" startWordPosition="4025" endWordPosition="4028">ender is more prone for. In sum, while we do believe that our data should lead sociolinguists to question whether AAVE is male-dominated, our findings may be biased by WRITTEN BIAS. 4 POS tagging We need automated syntactic analysis to study morpho-syntactic dialectal variation. We ran a state-of-the-art POS tagger trained on newswire5 (STANFORD), as well as two stateof-the-art POS taggers adapted to Twitter, namely GATE6 and ARK7, on our data. We had one professional annotator manually annotate 100 positive (AAVE) and 100 negative (non-AAVE) sentences using the coarsegrained tags proposed by Petrov et al. (2011). We map the tagger outputs to those tags and report tagging accuracies. See Table 5 for results, with A(+, −) being the absolute difference in performance from non-AAVE to AAVE. 5http://nlp.stanford.edu/software/ tagger.shtml 6https://gate.ac.uk/wiki/ twitter-postagger.html 7http://www.ark.cs.cmu.edu/ TweetNLP/ STANFORD GATE ARK AAVE 61.4 79.1 77.5 non-AAVE 74.5 83.3 77.9 Δ(+,-) 13.1 4.2 0.4 Table 5: POS tagging accuracies (%) While GATE is certainly better than STANFORD on our data, performance is generally poor and prohibitive of many downstream applications and variational studies. We also</context>
</contexts>
<marker>Petrov, Das, McDonald, 2011</marker>
<rawString>Slav Petrov, Dipanjan Das, and Ryan McDonald. 2011. A universal part-of-speech tagset. CoRR abs/1104.2086.</rawString>
</citation>
<citation valid="true">
<authors>
<author>K E Pollock</author>
<author>G Bailey</author>
<author>M Berni</author>
<author>D Fletcher</author>
<author>L Hinton</author>
<author>I Johnson</author>
<author>J Roberts</author>
<author>R Weaver</author>
</authors>
<date>1998</date>
<note>Phonological features of african american english. http://www.rehabmed.ualberta.ca/spa/phonology/features.htm.</note>
<contexts>
<context position="6650" citStr="Pollock et al., 1998" startWordPosition="1010" endWordPosition="1014"> features are more present in the Gulf states than in the rest of the United States (Rastogi et al., 2011). 10 H3: The likelihood of speaking AAVE correlates negatively with income and educational level, and AAVE is more frequently appropriated by men (Rickford, 1999; Rickford, 2010). H4: Derhotacization is more frequent in African Americans than in European Americans (Labov et al., 2005; Rickford, 1999). H5: Derhotacization is negatively correlated with income and educational level (Rickford, 1999). H6: Interdental fricative mutation is more frequent in AAVE than in European American speech (Pollock et al., 1998; Thomas, 2007). H7: Interdental fricative mutation is predominantly found in the Gulf states (Rastogi et al., 2011). H8: Backing in /str/ (to /skr/) is unique to AAVE (Rickford, 1999; Thomas, 2007; Labov, 2006). Hypotheses 1–8 are investigated by correlating the distribution of phonological variants in geo-located tweets with demographic information. Our method is similar to those proposed by Eisenstein (2013) and Doyle (2014), lending statistical power to sociolinguistic analyses, and circumventing traditional issues with data collection such as the Observer’s Paradox (Labov, 1972b; Meyerhof</context>
</contexts>
<marker>Pollock, Bailey, Berni, Fletcher, Hinton, Johnson, Roberts, Weaver, 1998</marker>
<rawString>K.E. Pollock, G. Bailey, M. Berni, D. Fletcher, L. Hinton, I. Johnson, J. Roberts, and R. Weaver. 1998. Phonological features of african american english. http://www.rehabmed.ualberta.ca/spa/phonology/features.htm.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Delip Rao</author>
<author>David Yarowsky</author>
<author>Abhishek Shreevats</author>
<author>Manaswi Gupta</author>
</authors>
<title>Classifying latent user attributes in twitter.</title>
<date>2010</date>
<booktitle>In Proceedings of the 2nd International Workshop on Search and Mining User-generated Contents,</booktitle>
<pages>37--44</pages>
<publisher>ACM.</publisher>
<contexts>
<context position="1837" citStr="Rao et al., 2010" startWordPosition="278" endWordPosition="281">logy since its completion, but is based on only 762 speakers. Dallas is represented by four subjects, the New York City dialect by six, etc. Data is costly to collect, and, as a consequence, scarce. Written language was traditionally used for formal purposes, and therefore differed in style from colloquial, spoken language. However, with the rise of social media platforms and the vast production of user generated content, differences between written and spoken language diminish. A number of recent papers have explored social media with respect to sociolinguistic and dialectological questions (Rao et al., 2010; Eisenstein, 2013; Volkova et al., 2013; Doyle, 2014; Hovy et al., 2015; Volkova et al., 2015; Johannsen et al., 2015; Hovy and Søgaard, 2015; Eisenstein, to appear). Emails, chats and social media posts serve purposes similar to those of spoken language, and consequently, features of spoken language, such as interjections, ellipses, and phonological variation, have found their way into this type of written language. Our work differs from most previous approaches by investigating several phonological spelling correlates of a specific language variety. The 284 million active users on Twitter p</context>
</contexts>
<marker>Rao, Yarowsky, Shreevats, Gupta, 2010</marker>
<rawString>Delip Rao, David Yarowsky, Abhishek Shreevats, and Manaswi Gupta. 2010. Classifying latent user attributes in twitter. In Proceedings of the 2nd International Workshop on Search and Mining User-generated Contents, pages 37–44. ACM.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Sonya Rastogi</author>
<author>Tallese D Johnson</author>
<author>Elizabeth M Hoeffel</author>
<author>Malcolm P Drewery Jr</author>
</authors>
<title>The black population: 2010.</title>
<date>2011</date>
<tech>Technical report, US Census,</tech>
<contexts>
<context position="6136" citStr="Rastogi et al., 2011" startWordPosition="935" endWordPosition="938">ools. 1.2 Sociolinguistic hypotheses AAVE is, in contrast to other North American dialects, not geographically restricted. Although variation in AAVE does exist, AAVE in urban settings has been established as a uniform system with suprasegmental norms (Ash and Myhill, 1986; Labov et al., 2005; Labov, 2006; Wolfram, 2004). This paper considers the following eight (8) hypotheses from the sociolinguistic literature about AAVE as a ethnolect: H1: AAVE is an urban ethnolect (Rickford, 1999; Wolfram, 2004). H2: AAVE features are more present in the Gulf states than in the rest of the United States (Rastogi et al., 2011). 10 H3: The likelihood of speaking AAVE correlates negatively with income and educational level, and AAVE is more frequently appropriated by men (Rickford, 1999; Rickford, 2010). H4: Derhotacization is more frequent in African Americans than in European Americans (Labov et al., 2005; Rickford, 1999). H5: Derhotacization is negatively correlated with income and educational level (Rickford, 1999). H6: Interdental fricative mutation is more frequent in AAVE than in European American speech (Pollock et al., 1998; Thomas, 2007). H7: Interdental fricative mutation is predominantly found in the Gulf</context>
</contexts>
<marker>Rastogi, Johnson, Hoeffel, Jr, 2011</marker>
<rawString>Sonya Rastogi, Tallese D. Johnson, Elizabeth M. Hoeffel, and Malcolm P. Drewery Jr. 2011. The black population: 2010. Technical report, US Census, September.</rawString>
</citation>
<citation valid="true">
<authors>
<author>John Rickford</author>
</authors>
<title>African American Vernacular English: Features, Evolution, Educational Implications.</title>
<date>1999</date>
<publisher>Blackwell,</publisher>
<location>Malden, MA.</location>
<contexts>
<context position="6004" citStr="Rickford, 1999" startWordPosition="913" endWordPosition="914">guistic research using social media data, namely that it is hard to analyze variation beyond the lexical level with current tools. 1.2 Sociolinguistic hypotheses AAVE is, in contrast to other North American dialects, not geographically restricted. Although variation in AAVE does exist, AAVE in urban settings has been established as a uniform system with suprasegmental norms (Ash and Myhill, 1986; Labov et al., 2005; Labov, 2006; Wolfram, 2004). This paper considers the following eight (8) hypotheses from the sociolinguistic literature about AAVE as a ethnolect: H1: AAVE is an urban ethnolect (Rickford, 1999; Wolfram, 2004). H2: AAVE features are more present in the Gulf states than in the rest of the United States (Rastogi et al., 2011). 10 H3: The likelihood of speaking AAVE correlates negatively with income and educational level, and AAVE is more frequently appropriated by men (Rickford, 1999; Rickford, 2010). H4: Derhotacization is more frequent in African Americans than in European Americans (Labov et al., 2005; Rickford, 1999). H5: Derhotacization is negatively correlated with income and educational level (Rickford, 1999). H6: Interdental fricative mutation is more frequent in AAVE than in </context>
<context position="13605" citStr="Rickford, 1999" startWordPosition="2139" endWordPosition="2140">n-rhotic dialects, /r/ is either not pronounced or is approximated as a vocalization in the surface form, when /r/ is in a pre-vocalic position. This can result in an elongation of the preceding vowel or in an off-glide schwa /@/, e.g., guard → /gA:d/, car → /ka:/, fear → /fi@/ (Thomas, 2007). Backing in /skr/ denotes the substitution 12 of /str/ for /skr/ in word-initial positions resulting in pronunciations such as /skrit/ for street, /skraq/ for strong and /skrT/ for strip. Backing in /str/ has been reported to be a unique feature in AAVE, as it is unheard in other North American dialects (Rickford, 1999; Labov, 1972a; Thomas, 2007). The two interdental fricative mutations relate to substitutions of /d/ and /0/ by /d/, /v/ and /t/, /f/ in words such as that and mother or nothing and with. It has been reported that mutations of /d/ and /0/ are more common among African Americans than among European Americans and that the frequency of the mutations is inversely correlated with socio-economic levels and formality of speaking (Rickford, 1999). We follow Eisenstein (2013) and Doyle (2014) in assuming that spelling variation may be a result of phonological differences and select 25 word pairs for o</context>
<context position="20413" citStr="Rickford, 1999" startWordPosition="3334" endWordPosition="3335">hic correlations nificantly with female speakers; seven with male. The correlations are found in Table 4. For each feature, certain words correlate significantly with female speakers, while others correlate significantly with male speakers. Consequently, neither our Twitter data not the Twitter data in the RTC suggest that AAVE is more often appropriated by men. We discuss whether our data provides a basis for falsifying the second half of H3 in §3.1. The high correlation between mutations of /D/ and longitude supports the presence of these mutations of /D/ in non-standard northern varieties (Rickford, 1999). The mutation of /T/ is also correlated with longitude, and with latitude, suggesting an Eastern American feature rather than a distinct Southern feature (Rickford, 1999). The variation in mutations could possibly be explained by both geography as well as the distribution og African Americans. There is evidence in our data that backing in /str/ (to /skr/) is appropriated more often by AAVE speakers than by speakers of other dialects (H8). There is also a negative correlation between latitude and backing in /str/ as well as a strong positive correlation with the Gulf states, suggesting that ba</context>
</contexts>
<marker>Rickford, 1999</marker>
<rawString>John Rickford. 1999. African American Vernacular English: Features, Evolution, Educational Implications. Blackwell, Malden, MA.</rawString>
</citation>
<citation valid="true">
<authors>
<author>John Rickford</author>
</authors>
<title>Geographical diversity, residential segregation, and the vitality of african american vernacular english and its speakers.</title>
<date>2010</date>
<journal>Transforming Anthropology,</journal>
<volume>18</volume>
<issue>1</issue>
<contexts>
<context position="6314" citStr="Rickford, 2010" startWordPosition="963" endWordPosition="964"> has been established as a uniform system with suprasegmental norms (Ash and Myhill, 1986; Labov et al., 2005; Labov, 2006; Wolfram, 2004). This paper considers the following eight (8) hypotheses from the sociolinguistic literature about AAVE as a ethnolect: H1: AAVE is an urban ethnolect (Rickford, 1999; Wolfram, 2004). H2: AAVE features are more present in the Gulf states than in the rest of the United States (Rastogi et al., 2011). 10 H3: The likelihood of speaking AAVE correlates negatively with income and educational level, and AAVE is more frequently appropriated by men (Rickford, 1999; Rickford, 2010). H4: Derhotacization is more frequent in African Americans than in European Americans (Labov et al., 2005; Rickford, 1999). H5: Derhotacization is negatively correlated with income and educational level (Rickford, 1999). H6: Interdental fricative mutation is more frequent in AAVE than in European American speech (Pollock et al., 1998; Thomas, 2007). H7: Interdental fricative mutation is predominantly found in the Gulf states (Rastogi et al., 2011). H8: Backing in /str/ (to /skr/) is unique to AAVE (Rickford, 1999; Thomas, 2007; Labov, 2006). Hypotheses 1–8 are investigated by correlating the </context>
</contexts>
<marker>Rickford, 2010</marker>
<rawString>John Rickford. 2010. Geographical diversity, residential segregation, and the vitality of african american vernacular english and its speakers. Transforming Anthropology, 18(1):28–34.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Sean Roberts</author>
<author>James Winters</author>
</authors>
<title>Linguistic diversity and traffic accidents: lessons from statistical studies of cultural traits.</title>
<date>2013</date>
<journal>PLoS ONE,</journal>
<volume>8</volume>
<issue>8</issue>
<contexts>
<context position="9660" citStr="Roberts and Winters, 2013" startWordPosition="1496" endWordPosition="1499"> about the individual Twitter user, we can use the geo-located tweets to study the correlation between socio-economic variables and linguistic features at the level of cities or ZIP codes.1 Eisenstein et al. (2011) note that this level of abstraction introduces some noise. Since Twitter users do not form representative samples of the population, the mean income for a city or ZIP code is not necessarily the mean income for the Twitter users in that area. We refer to this problem as the (3) USER POPULATION BIAS. Another serious methodological problem known as (4) GALTON’S PROBLEM (Naroll, 1961; Roberts and Winters, 2013), is the observation that cross-cultural associations are 1Unlike many others, we rely on physical locations rather than user-entered profile locations. See Graham et al. (2014) for discussion. 11 often explained by geographical diffusion. In other words, it is the problem of discriminating historical from functional associations in cross-cultural surveys. Briefly put, when we sample tweets and income-levels from US cities, there is little independence between the city data points. Linguistic features diffuse geographically and do not change at random, and we can therefore expect to see more s</context>
</contexts>
<marker>Roberts, Winters, 2013</marker>
<rawString>Sean Roberts and James Winters. 2013. Linguistic diversity and traffic accidents: lessons from statistical studies of cultural traits. PLoS ONE, 8(8).</rawString>
</citation>
<citation valid="true">
<authors>
<author>Eric Thomas</author>
</authors>
<title>Phonological and phonetic characteristics of African American Vernacular English.</title>
<date>2007</date>
<journal>Language and Linguistics Compass</journal>
<volume>1</volume>
<issue>5</issue>
<contexts>
<context position="6665" citStr="Thomas, 2007" startWordPosition="1015" endWordPosition="1016">sent in the Gulf states than in the rest of the United States (Rastogi et al., 2011). 10 H3: The likelihood of speaking AAVE correlates negatively with income and educational level, and AAVE is more frequently appropriated by men (Rickford, 1999; Rickford, 2010). H4: Derhotacization is more frequent in African Americans than in European Americans (Labov et al., 2005; Rickford, 1999). H5: Derhotacization is negatively correlated with income and educational level (Rickford, 1999). H6: Interdental fricative mutation is more frequent in AAVE than in European American speech (Pollock et al., 1998; Thomas, 2007). H7: Interdental fricative mutation is predominantly found in the Gulf states (Rastogi et al., 2011). H8: Backing in /str/ (to /skr/) is unique to AAVE (Rickford, 1999; Thomas, 2007; Labov, 2006). Hypotheses 1–8 are investigated by correlating the distribution of phonological variants in geo-located tweets with demographic information. Our method is similar to those proposed by Eisenstein (2013) and Doyle (2014), lending statistical power to sociolinguistic analyses, and circumventing traditional issues with data collection such as the Observer’s Paradox (Labov, 1972b; Meyerhof, 2006). Our wo</context>
<context position="13284" citStr="Thomas, 2007" startWordPosition="2085" endWordPosition="2086">tion, backing in /str/, and interdental fricative mutation. Specifically, we collect data to study the following four phonological variations (the latter two are both instances of interdental fricative mutation): a) derhotacization: /r/ → /Ø/ or /@/, b) /str/ → /skr/, c) /D/ → /d/ or /v/ and, d) /T/ → /t/ or /f/. In non-rhotic dialects, /r/ is either not pronounced or is approximated as a vocalization in the surface form, when /r/ is in a pre-vocalic position. This can result in an elongation of the preceding vowel or in an off-glide schwa /@/, e.g., guard → /gA:d/, car → /ka:/, fear → /fi@/ (Thomas, 2007). Backing in /skr/ denotes the substitution 12 of /str/ for /skr/ in word-initial positions resulting in pronunciations such as /skrit/ for street, /skraq/ for strong and /skrT/ for strip. Backing in /str/ has been reported to be a unique feature in AAVE, as it is unheard in other North American dialects (Rickford, 1999; Labov, 1972a; Thomas, 2007). The two interdental fricative mutations relate to substitutions of /d/ and /0/ by /d/, /v/ and /t/, /f/ in words such as that and mother or nothing and with. It has been reported that mutations of /d/ and /0/ are more common among African Americans</context>
</contexts>
<marker>Thomas, 2007</marker>
<rawString>Eric Thomas. 2007. Phonological and phonetic characteristics of African American Vernacular English. Language and Linguistics Compass, 1(5):450–475.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Svitlana Volkova</author>
<author>Theresa Wilson</author>
<author>David Yarowsky</author>
</authors>
<title>Exploring demographic language variations to improve multilingual sentiment analysis in social media.</title>
<date>2013</date>
<booktitle>In EMNLP.</booktitle>
<contexts>
<context position="1877" citStr="Volkova et al., 2013" startWordPosition="284" endWordPosition="287">sed on only 762 speakers. Dallas is represented by four subjects, the New York City dialect by six, etc. Data is costly to collect, and, as a consequence, scarce. Written language was traditionally used for formal purposes, and therefore differed in style from colloquial, spoken language. However, with the rise of social media platforms and the vast production of user generated content, differences between written and spoken language diminish. A number of recent papers have explored social media with respect to sociolinguistic and dialectological questions (Rao et al., 2010; Eisenstein, 2013; Volkova et al., 2013; Doyle, 2014; Hovy et al., 2015; Volkova et al., 2015; Johannsen et al., 2015; Hovy and Søgaard, 2015; Eisenstein, to appear). Emails, chats and social media posts serve purposes similar to those of spoken language, and consequently, features of spoken language, such as interjections, ellipses, and phonological variation, have found their way into this type of written language. Our work differs from most previous approaches by investigating several phonological spelling correlates of a specific language variety. The 284 million active users on Twitter post more than half a billion tweets ever</context>
</contexts>
<marker>Volkova, Wilson, Yarowsky, 2013</marker>
<rawString>Svitlana Volkova, Theresa Wilson, and David Yarowsky. 2013. Exploring demographic language variations to improve multilingual sentiment analysis in social media. In EMNLP.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Svitlana Volkova</author>
<author>Yoram Bachrach</author>
<author>Michael Armstrong</author>
<author>Vijay Sharma</author>
</authors>
<title>Inferring latent user properties from texts published in social media (demo).</title>
<date>2015</date>
<booktitle>In AAAI.</booktitle>
<contexts>
<context position="1931" citStr="Volkova et al., 2015" startWordPosition="294" endWordPosition="297">r subjects, the New York City dialect by six, etc. Data is costly to collect, and, as a consequence, scarce. Written language was traditionally used for formal purposes, and therefore differed in style from colloquial, spoken language. However, with the rise of social media platforms and the vast production of user generated content, differences between written and spoken language diminish. A number of recent papers have explored social media with respect to sociolinguistic and dialectological questions (Rao et al., 2010; Eisenstein, 2013; Volkova et al., 2013; Doyle, 2014; Hovy et al., 2015; Volkova et al., 2015; Johannsen et al., 2015; Hovy and Søgaard, 2015; Eisenstein, to appear). Emails, chats and social media posts serve purposes similar to those of spoken language, and consequently, features of spoken language, such as interjections, ellipses, and phonological variation, have found their way into this type of written language. Our work differs from most previous approaches by investigating several phonological spelling correlates of a specific language variety. The 284 million active users on Twitter post more than half a billion tweets every day, and some fraction of these tweets are geo-locat</context>
</contexts>
<marker>Volkova, Bachrach, Armstrong, Sharma, 2015</marker>
<rawString>Svitlana Volkova, Yoram Bachrach, Michael Armstrong, and Vijay Sharma. 2015. Inferring latent user properties from texts published in social media (demo). In AAAI.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Walt Wolfram</author>
</authors>
<title>The grammar of urban African American Vernacular English.</title>
<date>2004</date>
<booktitle>Handbook of Varieties of English</booktitle>
<pages>111--132</pages>
<editor>In Kortmann B. and E. Schneider, editors,</editor>
<location>Berlin</location>
<publisher>Mouton de Gruyter</publisher>
<contexts>
<context position="5837" citStr="Wolfram, 2004" startWordPosition="888" endWordPosition="889">state-of-the-art newswire and Twitter POS taggers perform much worse on tweets containing AAVE features. This suggests an additional limitation to large-scale sociolinguistic research using social media data, namely that it is hard to analyze variation beyond the lexical level with current tools. 1.2 Sociolinguistic hypotheses AAVE is, in contrast to other North American dialects, not geographically restricted. Although variation in AAVE does exist, AAVE in urban settings has been established as a uniform system with suprasegmental norms (Ash and Myhill, 1986; Labov et al., 2005; Labov, 2006; Wolfram, 2004). This paper considers the following eight (8) hypotheses from the sociolinguistic literature about AAVE as a ethnolect: H1: AAVE is an urban ethnolect (Rickford, 1999; Wolfram, 2004). H2: AAVE features are more present in the Gulf states than in the rest of the United States (Rastogi et al., 2011). 10 H3: The likelihood of speaking AAVE correlates negatively with income and educational level, and AAVE is more frequently appropriated by men (Rickford, 1999; Rickford, 2010). H4: Derhotacization is more frequent in African Americans than in European Americans (Labov et al., 2005; Rickford, 1999)</context>
</contexts>
<marker>Wolfram, 2004</marker>
<rawString>Walt Wolfram. 2004. The grammar of urban African American Vernacular English. In Kortmann B. and E. Schneider, editors, Handbook of Varieties of English, pages 111–132, Berlin. Mouton de Gruyter.</rawString>
</citation>
</citationList>
</algorithm>
</algorithms>