@inproceedings{palakodety-khudabukhsh-2020-annotation,
title = "Annotation Efficient Language Identification from Weak Labels",
author = "Palakodety, Shriphani and
KhudaBukhsh, Ashiqur",
editor = "Xu, Wei and
Ritter, Alan and
Baldwin, Tim and
Rahimi, Afshin",
booktitle = "Proceedings of the Sixth Workshop on Noisy User-generated Text (W-NUT 2020)",
month = nov,
year = "2020",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2020.wnut-1.24",
doi = "10.18653/v1/2020.wnut-1.24",
pages = "181--192",
abstract = "India is home to several languages with more than 30m speakers. These languages exhibit significant presence on social media platforms. However, several of these widely-used languages are under-addressed by current Natural Language Processing (NLP) models and resources. User generated social media content in these languages is also typically authored in the Roman script as opposed to the traditional native script further contributing to resource scarcity. In this paper, we leverage a minimally supervised NLP technique to obtain weak language labels from a large-scale Indian social media corpus leading to a robust and annotation-efficient language-identification technique spanning nine Romanized Indian languages. In fast-spreading pandemic situations such as the current COVID-19 situation, information processing objectives might be heavily tilted towards under-served languages in densely populated regions. We release our models to facilitate downstream analyses in these low-resource languages. Experiments across multiple social media corpora demonstrate the model{'}s robustness and provide several interesting insights on Indian language usage patterns on social media. We release an annotated data set of 1,000 comments in ten Romanized languages as a social media evaluation benchmark.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="palakodety-khudabukhsh-2020-annotation">
<titleInfo>
<title>Annotation Efficient Language Identification from Weak Labels</title>
</titleInfo>
<name type="personal">
<namePart type="given">Shriphani</namePart>
<namePart type="family">Palakodety</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ashiqur</namePart>
<namePart type="family">KhudaBukhsh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Sixth Workshop on Noisy User-generated Text (W-NUT 2020)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wei</namePart>
<namePart type="family">Xu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alan</namePart>
<namePart type="family">Ritter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tim</namePart>
<namePart type="family">Baldwin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Afshin</namePart>
<namePart type="family">Rahimi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>India is home to several languages with more than 30m speakers. These languages exhibit significant presence on social media platforms. However, several of these widely-used languages are under-addressed by current Natural Language Processing (NLP) models and resources. User generated social media content in these languages is also typically authored in the Roman script as opposed to the traditional native script further contributing to resource scarcity. In this paper, we leverage a minimally supervised NLP technique to obtain weak language labels from a large-scale Indian social media corpus leading to a robust and annotation-efficient language-identification technique spanning nine Romanized Indian languages. In fast-spreading pandemic situations such as the current COVID-19 situation, information processing objectives might be heavily tilted towards under-served languages in densely populated regions. We release our models to facilitate downstream analyses in these low-resource languages. Experiments across multiple social media corpora demonstrate the model’s robustness and provide several interesting insights on Indian language usage patterns on social media. We release an annotated data set of 1,000 comments in ten Romanized languages as a social media evaluation benchmark.</abstract>
<identifier type="citekey">palakodety-khudabukhsh-2020-annotation</identifier>
<identifier type="doi">10.18653/v1/2020.wnut-1.24</identifier>
<location>
<url>https://aclanthology.org/2020.wnut-1.24</url>
</location>
<part>
<date>2020-11</date>
<extent unit="page">
<start>181</start>
<end>192</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Annotation Efficient Language Identification from Weak Labels
%A Palakodety, Shriphani
%A KhudaBukhsh, Ashiqur
%Y Xu, Wei
%Y Ritter, Alan
%Y Baldwin, Tim
%Y Rahimi, Afshin
%S Proceedings of the Sixth Workshop on Noisy User-generated Text (W-NUT 2020)
%D 2020
%8 November
%I Association for Computational Linguistics
%C Online
%F palakodety-khudabukhsh-2020-annotation
%X India is home to several languages with more than 30m speakers. These languages exhibit significant presence on social media platforms. However, several of these widely-used languages are under-addressed by current Natural Language Processing (NLP) models and resources. User generated social media content in these languages is also typically authored in the Roman script as opposed to the traditional native script further contributing to resource scarcity. In this paper, we leverage a minimally supervised NLP technique to obtain weak language labels from a large-scale Indian social media corpus leading to a robust and annotation-efficient language-identification technique spanning nine Romanized Indian languages. In fast-spreading pandemic situations such as the current COVID-19 situation, information processing objectives might be heavily tilted towards under-served languages in densely populated regions. We release our models to facilitate downstream analyses in these low-resource languages. Experiments across multiple social media corpora demonstrate the model’s robustness and provide several interesting insights on Indian language usage patterns on social media. We release an annotated data set of 1,000 comments in ten Romanized languages as a social media evaluation benchmark.
%R 10.18653/v1/2020.wnut-1.24
%U https://aclanthology.org/2020.wnut-1.24
%U https://doi.org/10.18653/v1/2020.wnut-1.24
%P 181-192
Markdown (Informal)
[Annotation Efficient Language Identification from Weak Labels](https://aclanthology.org/2020.wnut-1.24) (Palakodety & KhudaBukhsh, WNUT 2020)
ACL