% Conference paper in the LREC-COLING 2024 proceedings (ACL Anthology ID 2024.lrec-main.1433).
% The abstract field is informational only; standard styles do not print it.
@inproceedings{abboud-oz-2024-towards,
  title     = {Towards Equitable Natural Language Understanding Systems for Dialectal Cohorts: Debiasing Training Data},
  author    = {Abboud, Khadige and
               Oz, Gokmen},
  editor    = {Calzolari, Nicoletta and
               Kan, Min-Yen and
               Hoste, Veronique and
               Lenci, Alessandro and
               Sakti, Sakriani and
               Xue, Nianwen},
  booktitle = {Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation ({LREC-COLING} 2024)},
  month     = may,
  year      = {2024},
  address   = {Torino, Italia},
  publisher = {ELRA and ICCL},
  url       = {https://aclanthology.org/2024.lrec-main.1433},
  pages     = {16487--16499},
  abstract  = {Despite being widely spoken, dialectal variants of languages are frequently considered low in resources due to lack of writing standards and orthographic inconsistencies. As a result, training natural language understanding (NLU) systems relies primarily on standard language resources leading to biased and inequitable NLU technology that underserves dialectal speakers. In this paper, we propose to address this problem through a framework composed of a dialect identification model that is used to obtain targeted training data augmentation for under-represented dialects, in an effort to debias NLU model for dialectal cohorts in NLU systems. We conduct experiments on two dialect rich non-English languages: Arabic and German, using large-scale commercial NLU datasets as well as open-source datasets. Results show that such framework can provide insights on dialect disparity in real-world NLU systems and targeted data augmentation can help narrow the model's performance gap between standard language speakers and dialect speakers.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="abboud-oz-2024-towards">
<titleInfo>
<title>Towards Equitable Natural Language Understanding Systems for Dialectal Cohorts: Debiasing Training Data</title>
</titleInfo>
<name type="personal">
<namePart type="given">Khadige</namePart>
<namePart type="family">Abboud</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gokmen</namePart>
<namePart type="family">Oz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Min-Yen</namePart>
<namePart type="family">Kan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Veronique</namePart>
<namePart type="family">Hoste</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alessandro</namePart>
<namePart type="family">Lenci</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sakriani</namePart>
<namePart type="family">Sakti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nianwen</namePart>
<namePart type="family">Xue</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>ELRA and ICCL</publisher>
<place>
<placeTerm type="text">Torino, Italia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Despite being widely spoken, dialectal variants of languages are frequently considered low in resources due to lack of writing standards and orthographic inconsistencies. As a result, training natural language understanding (NLU) systems relies primarily on standard language resources leading to biased and inequitable NLU technology that underserves dialectal speakers. In this paper, we propose to address this problem through a framework composed of a dialect identification model that is used to obtain targeted training data augmentation for under-represented dialects, in an effort to debias NLU model for dialectal cohorts in NLU systems. We conduct experiments on two dialect rich non-English languages: Arabic and German, using large-scale commercial NLU datasets as well as open-source datasets. Results show that such framework can provide insights on dialect disparity in real-world NLU systems and targeted data augmentation can help narrow the model’s performance gap between standard language speakers and dialect speakers.</abstract>
<identifier type="citekey">abboud-oz-2024-towards</identifier>
<location>
<url>https://aclanthology.org/2024.lrec-main.1433</url>
</location>
<part>
<date>2024-05</date>
<extent unit="page">
<start>16487</start>
<end>16499</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Towards Equitable Natural Language Understanding Systems for Dialectal Cohorts: Debiasing Training Data
%A Abboud, Khadige
%A Oz, Gokmen
%Y Calzolari, Nicoletta
%Y Kan, Min-Yen
%Y Hoste, Veronique
%Y Lenci, Alessandro
%Y Sakti, Sakriani
%Y Xue, Nianwen
%S Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)
%D 2024
%8 May
%I ELRA and ICCL
%C Torino, Italia
%F abboud-oz-2024-towards
%X Despite being widely spoken, dialectal variants of languages are frequently considered low in resources due to lack of writing standards and orthographic inconsistencies. As a result, training natural language understanding (NLU) systems relies primarily on standard language resources leading to biased and inequitable NLU technology that underserves dialectal speakers. In this paper, we propose to address this problem through a framework composed of a dialect identification model that is used to obtain targeted training data augmentation for under-represented dialects, in an effort to debias NLU model for dialectal cohorts in NLU systems. We conduct experiments on two dialect rich non-English languages: Arabic and German, using large-scale commercial NLU datasets as well as open-source datasets. Results show that such framework can provide insights on dialect disparity in real-world NLU systems and targeted data augmentation can help narrow the model’s performance gap between standard language speakers and dialect speakers.
%U https://aclanthology.org/2024.lrec-main.1433
%P 16487-16499
Markdown (Informal)
[Towards Equitable Natural Language Understanding Systems for Dialectal Cohorts: Debiasing Training Data](https://aclanthology.org/2024.lrec-main.1433) (Abboud & Oz, LREC-COLING 2024)
ACL