@inproceedings{shoemark-etal-2018-inducing,
title = "Inducing a lexicon of sociolinguistic variables from code-mixed text",
author = "Shoemark, Philippa and
Kirby, James and
Goldwater, Sharon",
editor = "Xu, Wei and
Ritter, Alan and
Baldwin, Tim and
Rahimi, Afshin",
booktitle = "Proceedings of the 2018 {EMNLP} Workshop W-{NUT}: The 4th Workshop on Noisy User-generated Text",
month = nov,
year = "2018",
address = "Brussels, Belgium",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/W18-6101",
doi = "10.18653/v1/W18-6101",
pages = "1--6",
abstract = "Sociolinguistics is often concerned with how variants of a linguistic item (e.g., \textit{nothing} vs. \textit{nothin{'}}) are used by different groups or in different situations. We introduce the task of inducing lexical variables from code-mixed text: that is, identifying equivalence pairs such as (\textit{football}, \textit{fitba}) along with their linguistic code (\textit{football}→British, \textit{fitba}→Scottish). We adapt a framework for identifying gender-biased word pairs to this new task, and present results on three different pairs of English dialects, using tweets as the code-mixed text. Our system achieves precision of over 70{\%} for two of these three datasets, and produces useful results even without extensive parameter tuning. Our success in adapting this framework from gender to language variety suggests that it could be used to discover other types of analogous pairs as well.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="shoemark-etal-2018-inducing">
<titleInfo>
<title>Inducing a lexicon of sociolinguistic variables from code-mixed text</title>
</titleInfo>
<name type="personal">
<namePart type="given">Philippa</namePart>
<namePart type="family">Shoemark</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">James</namePart>
<namePart type="family">Kirby</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sharon</namePart>
<namePart type="family">Goldwater</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2018-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2018 EMNLP Workshop W-NUT: The 4th Workshop on Noisy User-generated Text</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wei</namePart>
<namePart type="family">Xu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alan</namePart>
<namePart type="family">Ritter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tim</namePart>
<namePart type="family">Baldwin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Afshin</namePart>
<namePart type="family">Rahimi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Brussels, Belgium</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Sociolinguistics is often concerned with how variants of a linguistic item (e.g., nothing vs. nothin’) are used by different groups or in different situations. We introduce the task of inducing lexical variables from code-mixed text: that is, identifying equivalence pairs such as (football, fitba) along with their linguistic code (football→British, fitba→Scottish). We adapt a framework for identifying gender-biased word pairs to this new task, and present results on three different pairs of English dialects, using tweets as the code-mixed text. Our system achieves precision of over 70% for two of these three datasets, and produces useful results even without extensive parameter tuning. Our success in adapting this framework from gender to language variety suggests that it could be used to discover other types of analogous pairs as well.</abstract>
<identifier type="citekey">shoemark-etal-2018-inducing</identifier>
<identifier type="doi">10.18653/v1/W18-6101</identifier>
<location>
<url>https://aclanthology.org/W18-6101</url>
</location>
<part>
<date>2018-11</date>
<extent unit="page">
<start>1</start>
<end>6</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Inducing a lexicon of sociolinguistic variables from code-mixed text
%A Shoemark, Philippa
%A Kirby, James
%A Goldwater, Sharon
%Y Xu, Wei
%Y Ritter, Alan
%Y Baldwin, Tim
%Y Rahimi, Afshin
%S Proceedings of the 2018 EMNLP Workshop W-NUT: The 4th Workshop on Noisy User-generated Text
%D 2018
%8 November
%I Association for Computational Linguistics
%C Brussels, Belgium
%F shoemark-etal-2018-inducing
%X Sociolinguistics is often concerned with how variants of a linguistic item (e.g., nothing vs. nothin’) are used by different groups or in different situations. We introduce the task of inducing lexical variables from code-mixed text: that is, identifying equivalence pairs such as (football, fitba) along with their linguistic code (football→British, fitba→Scottish). We adapt a framework for identifying gender-biased word pairs to this new task, and present results on three different pairs of English dialects, using tweets as the code-mixed text. Our system achieves precision of over 70% for two of these three datasets, and produces useful results even without extensive parameter tuning. Our success in adapting this framework from gender to language variety suggests that it could be used to discover other types of analogous pairs as well.
%R 10.18653/v1/W18-6101
%U https://aclanthology.org/W18-6101
%U https://doi.org/10.18653/v1/W18-6101
%P 1-6
Markdown (Informal)
[Inducing a lexicon of sociolinguistic variables from code-mixed text](https://aclanthology.org/W18-6101) (Shoemark et al., WNUT 2018)
ACL