@inproceedings{wang-etal-2024-enhancing-hindi,
title = "Enhancing {H}indi Feature Representation through Fusion of Dual-Script Word Embeddings",
author = "Wang, Lianxi and
Tian, Yujia and
Chen, Zhuowei",
editor = "Calzolari, Nicoletta and
Kan, Min-Yen and
Hoste, Veronique and
Lenci, Alessandro and
Sakti, Sakriani and
Xue, Nianwen",
booktitle = "Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)",
month = may,
year = "2024",
address = "Torino, Italia",
publisher = "ELRA and ICCL",
url = "https://aclanthology.org/2024.lrec-main.528",
pages = "5966--5976",
abstract = "Pretrained language models excel in various natural language processing tasks but often neglect the integration of different scripts within a language, constraining their ability to capture richer semantic information, such as in Hindi. In this work, we present a dual-script enhanced feature representation method for Hindi. We combine single-script features from Devanagari and Romanized Hindi Roberta using concatenation, addition, cross-attention, and convolutional networks. The experiment results show that using a dual-script approach significantly improves model performance across various tasks. The addition fusion technique excels in sequence generation tasks, while for text classification, the CNN-based dual-script enhanced representation performs best with longer sentences, and the addition fusion technique is more effective for shorter sequences. Our approach shows significant advantages in multiple natural language processing tasks, providing a new perspective on feature representation for Hindi. Our code has been released on https://github.com/JohnnyChanV/Hindi-Fusion.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="wang-etal-2024-enhancing-hindi">
<titleInfo>
<title>Enhancing Hindi Feature Representation through Fusion of Dual-Script Word Embeddings</title>
</titleInfo>
<name type="personal">
<namePart type="given">Lianxi</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yujia</namePart>
<namePart type="family">Tian</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhuowei</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Min-Yen</namePart>
<namePart type="family">Kan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Veronique</namePart>
<namePart type="family">Hoste</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alessandro</namePart>
<namePart type="family">Lenci</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sakriani</namePart>
<namePart type="family">Sakti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nianwen</namePart>
<namePart type="family">Xue</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>ELRA and ICCL</publisher>
<place>
<placeTerm type="text">Torino, Italia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Pretrained language models excel in various natural language processing tasks but often neglect the integration of different scripts within a language, constraining their ability to capture richer semantic information, such as in Hindi. In this work, we present a dual-script enhanced feature representation method for Hindi. We combine single-script features from Devanagari and Romanized Hindi Roberta using concatenation, addition, cross-attention, and convolutional networks. The experiment results show that using a dual-script approach significantly improves model performance across various tasks. The addition fusion technique excels in sequence generation tasks, while for text classification, the CNN-based dual-script enhanced representation performs best with longer sentences, and the addition fusion technique is more effective for shorter sequences. Our approach shows significant advantages in multiple natural language processing tasks, providing a new perspective on feature representation for Hindi. Our code has been released on https://github.com/JohnnyChanV/Hindi-Fusion.</abstract>
<identifier type="citekey">wang-etal-2024-enhancing-hindi</identifier>
<location>
<url>https://aclanthology.org/2024.lrec-main.528</url>
</location>
<part>
<date>2024-05</date>
<extent unit="page">
<start>5966</start>
<end>5976</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Enhancing Hindi Feature Representation through Fusion of Dual-Script Word Embeddings
%A Wang, Lianxi
%A Tian, Yujia
%A Chen, Zhuowei
%Y Calzolari, Nicoletta
%Y Kan, Min-Yen
%Y Hoste, Veronique
%Y Lenci, Alessandro
%Y Sakti, Sakriani
%Y Xue, Nianwen
%S Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)
%D 2024
%8 May
%I ELRA and ICCL
%C Torino, Italia
%F wang-etal-2024-enhancing-hindi
%X Pretrained language models excel in various natural language processing tasks but often neglect the integration of different scripts within a language, constraining their ability to capture richer semantic information, such as in Hindi. In this work, we present a dual-script enhanced feature representation method for Hindi. We combine single-script features from Devanagari and Romanized Hindi Roberta using concatenation, addition, cross-attention, and convolutional networks. The experiment results show that using a dual-script approach significantly improves model performance across various tasks. The addition fusion technique excels in sequence generation tasks, while for text classification, the CNN-based dual-script enhanced representation performs best with longer sentences, and the addition fusion technique is more effective for shorter sequences. Our approach shows significant advantages in multiple natural language processing tasks, providing a new perspective on feature representation for Hindi. Our code has been released on https://github.com/JohnnyChanV/Hindi-Fusion.
%U https://aclanthology.org/2024.lrec-main.528
%P 5966-5976
Markdown (Informal)
[Enhancing Hindi Feature Representation through Fusion of Dual-Script Word Embeddings](https://aclanthology.org/2024.lrec-main.528) (Wang et al., LREC-COLING 2024)
ACL