@inproceedings{goel-sadat-2025-studying,
    title = "Studying the Effect of {H}indi Tokenizer Performance on Downstream Tasks",
    author = "Goel, Rashi and
      Sadat, Fatiha",
    editor = "Weerasinghe, Ruvan and
      Anuradha, Isuri and
      Sumanathilaka, Deshan",
    booktitle = "Proceedings of the First Workshop on Natural Language Processing for Indo-Aryan and Dravidian Languages",
    month = jan,
    year = "2025",
    address = "Abu Dhabi",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2025.indonlp-1.5/",
    pages = "44--49",
abstract = "This paper deals with a study on the effect of training data size and tokenizer performance for Hindi language on the eventual downstream model performance and comprehension. Multiple monolingual Hindi tokenizers are trained for large language models such as BERT and intrinsic and extrinsic evaluations are performed on multiple Hindi datasets. The objective of this study is to understand the precise effects of tokenizer performance on downstream task performance to gain insight on how to develop better models for low-resource languages."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="goel-sadat-2025-studying">
    <titleInfo>
      <title>Studying the Effect of Hindi Tokenizer Performance on Downstream Tasks</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Rashi</namePart>
      <namePart type="family">Goel</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Fatiha</namePart>
      <namePart type="family">Sadat</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2025-01</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the First Workshop on Natural Language Processing for Indo-Aryan and Dravidian Languages</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Ruvan</namePart>
        <namePart type="family">Weerasinghe</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Isuri</namePart>
        <namePart type="family">Anuradha</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Deshan</namePart>
        <namePart type="family">Sumanathilaka</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Abu Dhabi</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>This paper presents a study of the effect of training data size and tokenizer performance for the Hindi language on eventual downstream model performance and comprehension. Multiple monolingual Hindi tokenizers are trained for large language models such as BERT, and intrinsic and extrinsic evaluations are performed on multiple Hindi datasets. The objective of this study is to understand the precise effects of tokenizer performance on downstream task performance, to gain insight into how to develop better models for low-resource languages.</abstract>
    <identifier type="citekey">goel-sadat-2025-studying</identifier>
    <location>
      <url>https://aclanthology.org/2025.indonlp-1.5/</url>
    </location>
    <part>
      <date>2025-01</date>
      <extent unit="page">
        <start>44</start>
        <end>49</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Studying the Effect of Hindi Tokenizer Performance on Downstream Tasks
%A Goel, Rashi
%A Sadat, Fatiha
%Y Weerasinghe, Ruvan
%Y Anuradha, Isuri
%Y Sumanathilaka, Deshan
%S Proceedings of the First Workshop on Natural Language Processing for Indo-Aryan and Dravidian Languages
%D 2025
%8 January
%I Association for Computational Linguistics
%C Abu Dhabi
%F goel-sadat-2025-studying
%X This paper presents a study of the effect of training data size and tokenizer performance for the Hindi language on eventual downstream model performance and comprehension. Multiple monolingual Hindi tokenizers are trained for large language models such as BERT, and intrinsic and extrinsic evaluations are performed on multiple Hindi datasets. The objective of this study is to understand the precise effects of tokenizer performance on downstream task performance, to gain insight into how to develop better models for low-resource languages.
%U https://aclanthology.org/2025.indonlp-1.5/
%P 44-49
Markdown (Informal)
[Studying the Effect of Hindi Tokenizer Performance on Downstream Tasks](https://aclanthology.org/2025.indonlp-1.5/) (Goel & Sadat, IndoNLP 2025)
ACL
Rashi Goel and Fatiha Sadat. 2025. Studying the Effect of Hindi Tokenizer Performance on Downstream Tasks. In Proceedings of the First Workshop on Natural Language Processing for Indo-Aryan and Dravidian Languages, pages 44–49, Abu Dhabi. Association for Computational Linguistics.
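
As a rough illustration of the kind of pipeline the abstract describes, and not the authors' actual code, the sketch below trains a monolingual Hindi WordPiece tokenizer with the Hugging Face `tokenizers` library and reports fertility (subword tokens per word), one common intrinsic metric. The corpus file name, vocabulary size, and choice of metric are assumptions made for illustration; the paper's exact setup is not given here.

```python
# Minimal sketch: train a monolingual Hindi WordPiece tokenizer and compute
# fertility (average subword tokens per whitespace word) as an intrinsic metric.
# Corpus path and vocab size are hypothetical, not taken from the paper.
from tokenizers import Tokenizer, models, pre_tokenizers, trainers

CORPUS_FILES = ["hindi_corpus.txt"]  # hypothetical file of raw Hindi text, one sentence per line

tokenizer = Tokenizer(models.WordPiece(unk_token="[UNK]"))
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

trainer = trainers.WordPieceTrainer(
    vocab_size=32000,  # assumed value for illustration
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
)
tokenizer.train(CORPUS_FILES, trainer)

def fertility(tok, sentences):
    """Average number of subword tokens produced per whitespace-separated word."""
    words = sum(len(s.split()) for s in sentences)
    subwords = sum(len(tok.encode(s).tokens) for s in sentences)
    return subwords / max(words, 1)

sample = ["भारत एक विशाल देश है।"]
print("fertility:", fertility(tokenizer, sample))
```

Lower fertility generally means the tokenizer splits Hindi words into fewer pieces; this is the sort of intrinsic signal the study relates to downstream task performance.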