% Conference paper from Findings of EMNLP 2023. Case-sensitive words in the
% title and booktitle are protected with whole-word brace groups.
@inproceedings{kumar-etal-2023-indisocialft,
  title     = {{IndiSocialFT}: Multilingual Word Representation for {Indian} languages in code-mixed environment},
  author    = {Kumar, Saurabh and
               Sanasam, Ranbir and
               Nandi, Sukumar},
  editor    = {Bouamor, Houda and
               Pino, Juan and
               Bali, Kalika},
  booktitle = {Findings of the Association for Computational Linguistics: {EMNLP} 2023},
  month     = dec,
  year      = {2023},
  address   = {Singapore},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2023.findings-emnlp.252},
  doi       = {10.18653/v1/2023.findings-emnlp.252},
  pages     = {3866--3871},
  abstract  = {The increasing number of Indian language users on the internet necessitates the development of Indian language technologies. In response to this demand, our paper presents a generalized representation vector for diverse text characteristics, including native scripts, transliterated text, multilingual, code-mixed, and social media-related attributes. We gather text from both social media and well-formed sources and utilize the FastText model to create the {``}IndiSocialFT{''} embedding. Through intrinsic and extrinsic evaluation methods, we compare IndiSocialFT with three popular pretrained embeddings trained over Indian languages. Our findings show that the proposed embedding surpasses the baselines in most cases and languages, demonstrating its suitability for various NLP applications.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- MODS record for the paper with citekey kumar-etal-2023-indisocialft -->
  <mods ID="kumar-etal-2023-indisocialft">
    <titleInfo>
      <title>IndiSocialFT: Multilingual Word Representation for Indian languages in code-mixed environment</title>
    </titleInfo>
    <!-- Authors, in the order listed -->
    <name type="personal">
      <namePart type="given">Saurabh</namePart>
      <namePart type="family">Kumar</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Ranbir</namePart>
      <namePart type="family">Sanasam</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Sukumar</namePart>
      <namePart type="family">Nandi</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2023-12</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, with its editors and publisher -->
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: EMNLP 2023</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Houda</namePart>
        <namePart type="family">Bouamor</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Juan</namePart>
        <namePart type="family">Pino</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Kalika</namePart>
        <namePart type="family">Bali</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Singapore</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>The increasing number of Indian language users on the internet necessitates the development of Indian language technologies. In response to this demand, our paper presents a generalized representation vector for diverse text characteristics, including native scripts, transliterated text, multilingual, code-mixed, and social media-related attributes. We gather text from both social media and well-formed sources and utilize the FastText model to create the “IndiSocialFT” embedding. Through intrinsic and extrinsic evaluation methods, we compare IndiSocialFT with three popular pretrained embeddings trained over Indian languages. Our findings show that the proposed embedding surpasses the baselines in most cases and languages, demonstrating its suitability for various NLP applications.</abstract>
    <!-- Identifiers and landing page -->
    <identifier type="citekey">kumar-etal-2023-indisocialft</identifier>
    <identifier type="doi">10.18653/v1/2023.findings-emnlp.252</identifier>
    <location>
      <url>https://aclanthology.org/2023.findings-emnlp.252</url>
    </location>
    <!-- Position of the paper within the host volume -->
    <part>
      <date>2023-12</date>
      <extent unit="page">
        <start>3866</start>
        <end>3871</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T IndiSocialFT: Multilingual Word Representation for Indian languages in code-mixed environment
%A Kumar, Saurabh
%A Sanasam, Ranbir
%A Nandi, Sukumar
%Y Bouamor, Houda
%Y Pino, Juan
%Y Bali, Kalika
%S Findings of the Association for Computational Linguistics: EMNLP 2023
%D 2023
%8 December
%I Association for Computational Linguistics
%C Singapore
%F kumar-etal-2023-indisocialft
%X The increasing number of Indian language users on the internet necessitates the development of Indian language technologies. In response to this demand, our paper presents a generalized representation vector for diverse text characteristics, including native scripts, transliterated text, multilingual, code-mixed, and social media-related attributes. We gather text from both social media and well-formed sources and utilize the FastText model to create the “IndiSocialFT” embedding. Through intrinsic and extrinsic evaluation methods, we compare IndiSocialFT with three popular pretrained embeddings trained over Indian languages. Our findings show that the proposed embedding surpasses the baselines in most cases and languages, demonstrating its suitability for various NLP applications.
%R 10.18653/v1/2023.findings-emnlp.252
%U https://aclanthology.org/2023.findings-emnlp.252
%U https://doi.org/10.18653/v1/2023.findings-emnlp.252
%P 3866-3871
Markdown (Informal)
[IndiSocialFT: Multilingual Word Representation for Indian languages in code-mixed environment](https://aclanthology.org/2023.findings-emnlp.252) (Kumar et al., Findings 2023)
ACL