@inproceedings{nielsen-etal-2023-distinguishing,
title = "Distinguishing {R}omanized {H}indi from {R}omanized {U}rdu",
author = "Nielsen, Elizabeth and
Kirov, Christo and
Roark, Brian",
editor = "Gorman, Kyle and
Sproat, Richard and
Roark, Brian",
booktitle = "Proceedings of the Workshop on Computation and Written Language (CAWL 2023)",
month = jul,
year = "2023",
address = "Toronto, Canada",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.cawl-1.5",
doi = "10.18653/v1/2023.cawl-1.5",
pages = "33--42",
abstract = "We examine the task of distinguishing between Hindi and Urdu when those languages are romanized, i.e., written in the Latin script. Both languages are widely informally romanized, and to the extent that they are identified in the Latin script by language identification systems, they are typically conflated. In the absence of large labeled collections of such text, we consider methods for generating training data. Beginning with a small set of seed words, each of which are strongly indicative of one of the languages versus the other, we prompt a pretrained large language model (LLM) to generate romanized text. Treating text generated from an Urdu prompt as one class and text generated from a Hindi prompt as the other class, we build a binary language identification (LangID) classifier. We demonstrate that the resulting classifier distinguishes manually romanized Urdu Wikipedia text from manually romanized Hindi Wikipedia text far better than chance. We use this classifier to estimate the prevalence of Urdu in a large collection of text labeled as romanized Hindi that has been used to train large language models. These techniques can be applied to bootstrap classifiers in other cases where a dataset is known to contain multiple distinct but related classes, such as different dialects of the same language, but for which labels cannot easily be obtained.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="nielsen-etal-2023-distinguishing">
<titleInfo>
<title>Distinguishing Romanized Hindi from Romanized Urdu</title>
</titleInfo>
<name type="personal">
<namePart type="given">Elizabeth</namePart>
<namePart type="family">Nielsen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Christo</namePart>
<namePart type="family">Kirov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Brian</namePart>
<namePart type="family">Roark</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Workshop on Computation and Written Language (CAWL 2023)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kyle</namePart>
<namePart type="family">Gorman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Richard</namePart>
<namePart type="family">Sproat</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Brian</namePart>
<namePart type="family">Roark</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Toronto, Canada</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We examine the task of distinguishing between Hindi and Urdu when those languages are romanized, i.e., written in the Latin script. Both languages are widely informally romanized, and to the extent that they are identified in the Latin script by language identification systems, they are typically conflated. In the absence of large labeled collections of such text, we consider methods for generating training data. Beginning with a small set of seed words, each of which are strongly indicative of one of the languages versus the other, we prompt a pretrained large language model (LLM) to generate romanized text. Treating text generated from an Urdu prompt as one class and text generated from a Hindi prompt as the other class, we build a binary language identification (LangID) classifier. We demonstrate that the resulting classifier distinguishes manually romanized Urdu Wikipedia text from manually romanized Hindi Wikipedia text far better than chance. We use this classifier to estimate the prevalence of Urdu in a large collection of text labeled as romanized Hindi that has been used to train large language models. These techniques can be applied to bootstrap classifiers in other cases where a dataset is known to contain multiple distinct but related classes, such as different dialects of the same language, but for which labels cannot easily be obtained.</abstract>
<identifier type="citekey">nielsen-etal-2023-distinguishing</identifier>
<identifier type="doi">10.18653/v1/2023.cawl-1.5</identifier>
<location>
<url>https://aclanthology.org/2023.cawl-1.5</url>
</location>
<part>
<date>2023-07</date>
<extent unit="page">
<start>33</start>
<end>42</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Distinguishing Romanized Hindi from Romanized Urdu
%A Nielsen, Elizabeth
%A Kirov, Christo
%A Roark, Brian
%Y Gorman, Kyle
%Y Sproat, Richard
%Y Roark, Brian
%S Proceedings of the Workshop on Computation and Written Language (CAWL 2023)
%D 2023
%8 July
%I Association for Computational Linguistics
%C Toronto, Canada
%F nielsen-etal-2023-distinguishing
%X We examine the task of distinguishing between Hindi and Urdu when those languages are romanized, i.e., written in the Latin script. Both languages are widely informally romanized, and to the extent that they are identified in the Latin script by language identification systems, they are typically conflated. In the absence of large labeled collections of such text, we consider methods for generating training data. Beginning with a small set of seed words, each of which are strongly indicative of one of the languages versus the other, we prompt a pretrained large language model (LLM) to generate romanized text. Treating text generated from an Urdu prompt as one class and text generated from a Hindi prompt as the other class, we build a binary language identification (LangID) classifier. We demonstrate that the resulting classifier distinguishes manually romanized Urdu Wikipedia text from manually romanized Hindi Wikipedia text far better than chance. We use this classifier to estimate the prevalence of Urdu in a large collection of text labeled as romanized Hindi that has been used to train large language models. These techniques can be applied to bootstrap classifiers in other cases where a dataset is known to contain multiple distinct but related classes, such as different dialects of the same language, but for which labels cannot easily be obtained.
%R 10.18653/v1/2023.cawl-1.5
%U https://aclanthology.org/2023.cawl-1.5
%U https://doi.org/10.18653/v1/2023.cawl-1.5
%P 33-42
Markdown (Informal)
[Distinguishing Romanized Hindi from Romanized Urdu](https://aclanthology.org/2023.cawl-1.5) (Nielsen et al., CAWL 2023)
ACL
- Elizabeth Nielsen, Christo Kirov, and Brian Roark. 2023. Distinguishing Romanized Hindi from Romanized Urdu. In Proceedings of the Workshop on Computation and Written Language (CAWL 2023), pages 33–42, Toronto, Canada. Association for Computational Linguistics.