% Conference paper record (ACL Anthology ID 2016.clib-1.11).
% Proper nouns and acronyms are brace-protected as whole words so that styles
% which sentence-case titles keep their capitalisation.
@inproceedings{jaf-2016-simple,
  title     = {A Simple Approach to Unifying Ambiguously Encoded {Kurdish} Characters},
  author    = {Jaf, Sardar},
  booktitle = {Proceedings of the Second International Conference on Computational Linguistics in {Bulgaria} ({CLIB} 2016)},
  month     = sep,
  year      = {2016},
  address   = {Sofia, Bulgaria},
  publisher = {Department of Computational Linguistics, Institute for Bulgarian Language, Bulgarian Academy of Sciences},
  url       = {https://aclanthology.org/2016.clib-1.11},
  pages     = {86--94},
  abstract  = {In this study we outline a potential problem in the normalisation stage of processing texts that are based on a modified version of the Arabic alphabet. The main source of resources available for processing resource-scarce languages is raw text. We have identified an interesting challenge that must be addressed when normalising certain natural language texts. Many less-resourced languages, such as Kurdish, Farsi, Urdu, Pashtu, etc., use a modified version of the Arabic writing system. Many characters in harvested data from the Internet may have exactly the same form but encoded with different Unicode values (ambiguous characters). It is important to identify ambiguous characters during the normalisation stage of most text processing tasks. We will demonstrate cases related to ambiguous Kurdish and Farsi characters and propose a semi-automatic approach to identifying and unifying ambiguously encoded characters.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- MODS record for citation key jaf-2016-simple -->
  <mods ID="jaf-2016-simple">
    <titleInfo>
      <title>A Simple Approach to Unifying Ambiguously Encoded Kurdish Characters</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Sardar</namePart>
      <namePart type="family">Jaf</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2016-09</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume that contains this paper -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the Second International Conference on Computational Linguistics in Bulgaria (CLIB 2016)</title>
      </titleInfo>
      <originInfo>
        <publisher>Department of Computational Linguistics, Institute for Bulgarian Language, Bulgarian Academy of Sciences</publisher>
        <place>
          <placeTerm type="text">Sofia, Bulgaria</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>In this study we outline a potential problem in the normalisation stage of processing texts that are based on a modified version of the Arabic alphabet. The main source of resources available for processing resource-scarce languages is raw text. We have identified an interesting challenge that must be addressed when normalising certain natural language texts. Many less-resourced languages, such as Kurdish, Farsi, Urdu, Pashtu, etc., use a modified version of the Arabic writing system. Many characters in harvested data from the Internet may have exactly the same form but encoded with different Unicode values (ambiguous characters). It is important to identify ambiguous characters during the normalisation stage of most text processing tasks. We will demonstrate cases related to ambiguous Kurdish and Farsi characters and propose a semi-automatic approach to identifying and unifying ambiguously encoded characters.</abstract>
    <identifier type="citekey">jaf-2016-simple</identifier>
    <location>
      <url>https://aclanthology.org/2016.clib-1.11</url>
    </location>
    <!-- Publication date and page range within the host volume -->
    <part>
      <date>2016-09</date>
      <extent unit="page">
        <start>86</start>
        <end>94</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T A Simple Approach to Unifying Ambiguously Encoded Kurdish Characters
%A Jaf, Sardar
%S Proceedings of the Second International Conference on Computational Linguistics in Bulgaria (CLIB 2016)
%D 2016
%8 September
%I Department of Computational Linguistics, Institute for Bulgarian Language, Bulgarian Academy of Sciences
%C Sofia, Bulgaria
%F jaf-2016-simple
%X In this study we outline a potential problem in the normalisation stage of processing texts that are based on a modified version of the Arabic alphabet. The main source of resources available for processing resource-scarce languages is raw text. We have identified an interesting challenge that must be addressed when normalising certain natural language texts. Many less-resourced languages, such as Kurdish, Farsi, Urdu, Pashtu, etc., use a modified version of the Arabic writing system. Many characters in harvested data from the Internet may have exactly the same form but encoded with different Unicode values (ambiguous characters). It is important to identify ambiguous characters during the normalisation stage of most text processing tasks. We will demonstrate cases related to ambiguous Kurdish and Farsi characters and propose a semi-automatic approach to identifying and unifying ambiguously encoded characters.
%U https://aclanthology.org/2016.clib-1.11
%P 86-94
Markdown (Informal)
[A Simple Approach to Unifying Ambiguously Encoded Kurdish Characters](https://aclanthology.org/2016.clib-1.11) (Jaf, CLIB 2016)
ACL