@inproceedings{santiago-etal-2022-disambiguation,
title = "Disambiguation of morpho-syntactic features of {A}frican {A}merican {E}nglish {--} the case of habitual be",
author = "Santiago, Harrison and
Martin, Joshua and
Moeller, Sarah and
Tang, Kevin",
editor = "Chakravarthi, Bharathi Raja and
Bharathi, B and
McCrae, John P and
Zarrouk, Manel and
Bali, Kalika and
Buitelaar, Paul",
booktitle = "Proceedings of the Second Workshop on Language Technology for Equality, Diversity and Inclusion",
month = may,
year = "2022",
address = "Dublin, Ireland",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.ltedi-1.9",
doi = "10.18653/v1/2022.ltedi-1.9",
pages = "70--75",
abstract = "Recent research has highlighted that natural language processing (NLP) systems exhibit a bias against African American speakers. These errors are often caused by poor representation of linguistic features unique to African American English (AAE), which is due to the relatively low probability of occurrence for many such features. We present a workflow to overcome this issue in the case of habitual {``}be{''}. Habitual {``}be{''} is isomorphic, and therefore ambiguous, with other forms of uninflected {``}be{''} found in both AAE and General American English (GAE). This creates a clear challenge for bias in NLP technologies. To overcome the scarcity, we employ a combination of rule-based filters and data augmentation that generate a corpus balanced between habitual and non-habitual instances. This balanced corpus trains unbiased machine learning classifiers, as demonstrated on a corpus of AAE transcribed texts, achieving .65 F$_1$ score at classifying habitual {``}be{''}.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="santiago-etal-2022-disambiguation">
<titleInfo>
<title>Disambiguation of morpho-syntactic features of African American English – the case of habitual be</title>
</titleInfo>
<name type="personal">
<namePart type="given">Harrison</namePart>
<namePart type="family">Santiago</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joshua</namePart>
<namePart type="family">Martin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sarah</namePart>
<namePart type="family">Moeller</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kevin</namePart>
<namePart type="family">Tang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Second Workshop on Language Technology for Equality, Diversity and Inclusion</title>
</titleInfo>
<name type="personal">
<namePart type="given">Bharathi</namePart>
<namePart type="given">Raja</namePart>
<namePart type="family">Chakravarthi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">B</namePart>
<namePart type="family">Bharathi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">John</namePart>
<namePart type="given">P</namePart>
<namePart type="family">McCrae</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Manel</namePart>
<namePart type="family">Zarrouk</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kalika</namePart>
<namePart type="family">Bali</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Paul</namePart>
<namePart type="family">Buitelaar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Dublin, Ireland</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Recent research has highlighted that natural language processing (NLP) systems exhibit a bias against African American speakers. These errors are often caused by poor representation of linguistic features unique to African American English (AAE), which is due to the relatively low probability of occurrence for many such features. We present a workflow to overcome this issue in the case of habitual “be”. Habitual “be” is isomorphic, and therefore ambiguous, with other forms of uninflected “be” found in both AAE and General American English (GAE). This creates a clear challenge for bias in NLP technologies. To overcome the scarcity, we employ a combination of rule-based filters and data augmentation that generate a corpus balanced between habitual and non-habitual instances. This balanced corpus trains unbiased machine learning classifiers, as demonstrated on a corpus of AAE transcribed texts, achieving .65 F₁ score at classifying habitual “be”.</abstract>
<identifier type="citekey">santiago-etal-2022-disambiguation</identifier>
<identifier type="doi">10.18653/v1/2022.ltedi-1.9</identifier>
<location>
<url>https://aclanthology.org/2022.ltedi-1.9</url>
</location>
<part>
<date>2022-05</date>
<extent unit="page">
<start>70</start>
<end>75</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Disambiguation of morpho-syntactic features of African American English – the case of habitual be
%A Santiago, Harrison
%A Martin, Joshua
%A Moeller, Sarah
%A Tang, Kevin
%Y Chakravarthi, Bharathi Raja
%Y Bharathi, B.
%Y McCrae, John P.
%Y Zarrouk, Manel
%Y Bali, Kalika
%Y Buitelaar, Paul
%S Proceedings of the Second Workshop on Language Technology for Equality, Diversity and Inclusion
%D 2022
%8 May
%I Association for Computational Linguistics
%C Dublin, Ireland
%F santiago-etal-2022-disambiguation
%X Recent research has highlighted that natural language processing (NLP) systems exhibit a bias against African American speakers. These errors are often caused by poor representation of linguistic features unique to African American English (AAE), which is due to the relatively low probability of occurrence for many such features. We present a workflow to overcome this issue in the case of habitual “be”. Habitual “be” is isomorphic, and therefore ambiguous, with other forms of uninflected “be” found in both AAE and General American English (GAE). This creates a clear challenge for bias in NLP technologies. To overcome the scarcity, we employ a combination of rule-based filters and data augmentation that generate a corpus balanced between habitual and non-habitual instances. This balanced corpus trains unbiased machine learning classifiers, as demonstrated on a corpus of AAE transcribed texts, achieving .65 F₁ score at classifying habitual “be”.
%R 10.18653/v1/2022.ltedi-1.9
%U https://aclanthology.org/2022.ltedi-1.9
%U https://doi.org/10.18653/v1/2022.ltedi-1.9
%P 70-75
Markdown (Informal)
[Disambiguation of morpho-syntactic features of African American English – the case of habitual be](https://aclanthology.org/2022.ltedi-1.9) (Santiago et al., LTEDI 2022)
ACL