% Conference paper from the ICON 2021 proceedings (ACL Anthology export).
% Words braced in title/booktitle keep their capitalisation under styles
% that apply sentence casing.
% NOTE(review): `address` appears to hold the conference venue rather than
% the publisher's city -- confirm before normalising for a strict style.
@inproceedings{dhar-das-2021-leveraging,
  title     = {Leveraging Expectation Maximization for Identifying Claims in Low Resource {Indian} Languages},
  author    = {Dhar, Rudra and Das, Dipankar},
  editor    = {Bandyopadhyay, Sivaji and Devi, Sobha Lalitha and Bhattacharyya, Pushpak},
  booktitle = {Proceedings of the 18th International Conference on Natural Language Processing ({ICON})},
  month     = dec,
  year      = {2021},
  address   = {National Institute of Technology Silchar, Silchar, India},
  publisher = {NLP Association of India (NLPAI)},
  url       = {https://aclanthology.org/2021.icon-main.37/},
  pages     = {307--312},
  abstract  = {Identification of the checkable claims is one of the important prior tasks while dealing with infinite amount of data streaming from social web and the task becomes a compulsory one when we analyze them on behalf of a multilingual country like India that contains more than 1 billion people. In the present work, we describe our system which is made for detecting check-worthy claim sentences in resource scarce Indian languages (e.g., Bengali and Hindi). Firstly, we collected sentences from various sources in Bengali and Hindi and vectorized them with several NLP features. We labeled a small portion of them for check-worthy claims manually. However, in order to label rest amount of data in a semi-supervised fashion, we employed the Expectation Maximization (EM) algorithm tuned with the Multivariate Gaussian Mixture Model (GMM) to assign weakly labels. The optimal number of Gaussians in this algorithm is traced by using Logistic Regression. Furthermore, we used different ratios of manually labeled data and weakly labeled data to train our various machine learning models. We tabulated and plotted the performances of the models along with the stepwise decrement in proportion of manually labeled data. The experimental results were at par with our theoretical understanding, and we conclude that the weakly labeling of check-worthy claim sentences in low resource languages with EM algorithm has true potential.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS collection holding a single record, citekey dhar-das-2021-leveraging. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="dhar-das-2021-leveraging">
    <!-- The paper itself: title, authors, issue date. -->
    <titleInfo>
      <title>Leveraging Expectation Maximization for Identifying Claims in Low Resource Indian Languages</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Rudra</namePart>
      <namePart type="family">Dhar</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Dipankar</namePart>
      <namePart type="family">Das</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2021-12</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, with its editors, publisher and place. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 18th International Conference on Natural Language Processing (ICON)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Sivaji</namePart>
        <namePart type="family">Bandyopadhyay</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Sobha</namePart>
        <namePart type="given">Lalitha</namePart>
        <namePart type="family">Devi</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Pushpak</namePart>
        <namePart type="family">Bhattacharyya</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>NLP Association of India (NLPAI)</publisher>
        <place>
          <placeTerm type="text">National Institute of Technology Silchar, Silchar, India</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Identification of the checkable claims is one of the important prior tasks while dealing with infinite amount of data streaming from social web and the task becomes a compulsory one when we analyze them on behalf of a multilingual country like India that contains more than 1 billion people. In the present work, we describe our system which is made for detecting check-worthy claim sentences in resource scarce Indian languages (e.g., Bengali and Hindi). Firstly, we collected sentences from various sources in Bengali and Hindi and vectorized them with several NLP features. We labeled a small portion of them for check-worthy claims manually. However, in order to label rest amount of data in a semi-supervised fashion, we employed the Expectation Maximization (EM) algorithm tuned with the Multivariate Gaussian Mixture Model (GMM) to assign weakly labels. The optimal number of Gaussians in this algorithm is traced by using Logistic Regression. Furthermore, we used different ratios of manually labeled data and weakly labeled data to train our various machine learning models. We tabulated and plotted the performances of the models along with the stepwise decrement in proportion of manually labeled data. The experimental results were at par with our theoretical understanding, and we conclude that the weakly labeling of check-worthy claim sentences in low resource languages with EM algorithm has true potential.</abstract>
    <identifier type="citekey">dhar-das-2021-leveraging</identifier>
    <location>
      <url>https://aclanthology.org/2021.icon-main.37/</url>
    </location>
    <!-- Position of the paper within the host volume (page extent). -->
    <part>
      <date>2021-12</date>
      <extent unit="page">
        <start>307</start>
        <end>312</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Leveraging Expectation Maximization for Identifying Claims in Low Resource Indian Languages
%A Dhar, Rudra
%A Das, Dipankar
%Y Bandyopadhyay, Sivaji
%Y Devi, Sobha Lalitha
%Y Bhattacharyya, Pushpak
%S Proceedings of the 18th International Conference on Natural Language Processing (ICON)
%D 2021
%8 December
%I NLP Association of India (NLPAI)
%C National Institute of Technology Silchar, Silchar, India
%F dhar-das-2021-leveraging
%X Identification of the checkable claims is one of the important prior tasks while dealing with infinite amount of data streaming from social web and the task becomes a compulsory one when we analyze them on behalf of a multilingual country like India that contains more than 1 billion people. In the present work, we describe our system which is made for detecting check-worthy claim sentences in resource scarce Indian languages (e.g., Bengali and Hindi). Firstly, we collected sentences from various sources in Bengali and Hindi and vectorized them with several NLP features. We labeled a small portion of them for check-worthy claims manually. However, in order to label rest amount of data in a semi-supervised fashion, we employed the Expectation Maximization (EM) algorithm tuned with the Multivariate Gaussian Mixture Model (GMM) to assign weakly labels. The optimal number of Gaussians in this algorithm is traced by using Logistic Regression. Furthermore, we used different ratios of manually labeled data and weakly labeled data to train our various machine learning models. We tabulated and plotted the performances of the models along with the stepwise decrement in proportion of manually labeled data. The experimental results were at par with our theoretical understanding, and we conclude that the weakly labeling of check-worthy claim sentences in low resource languages with EM algorithm has true potential.
%U https://aclanthology.org/2021.icon-main.37/
%P 307-312
Markdown (Informal)
[Leveraging Expectation Maximization for Identifying Claims in Low Resource Indian Languages](https://aclanthology.org/2021.icon-main.37/) (Dhar & Das, ICON 2021)
ACL