@inproceedings{farhan-etal-2020-enhanced,
title = "Enhanced {U}rdu Word Segmentation using Conditional Random Fields and Morphological Context Features",
author = "Farhan, Aamir and
Islam, Mashrukh and
Sharma, Dipti Misra",
editor = "Cunha, Rossana and
Shaikh, Samira and
Varis, Erika and
Georgi, Ryan and
Tsai, Alicia and
Anastasopoulos, Antonios and
Chandu, Khyathi Raghavi",
booktitle = "Proceedings of the Fourth Widening Natural Language Processing Workshop",
month = jul,
year = "2020",
address = "Seattle, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2020.winlp-1.41",
doi = "10.18653/v1/2020.winlp-1.41",
pages = "156--159",
abstract = "Word segmentation is a fundamental task for most of the NLP applications. Urdu adopts Nastalique writing style which does not have a concept of space. Furthermore, the inherent non-joining attributes of certain characters in Urdu create spaces within a word while writing in digital format. Thus, Urdu not only has space omission but also space insertion issues which make the word segmentation task challenging. In this paper, we improve upon the results of Zia, Raza and Athar (2018) by using a manually annotated corpus of 19,651 sentences along with morphological context features. Using the Conditional Random Field sequence modeler, our model achieves F 1 score of 0.98 for word boundary identification and 0.92 for sub-word boundary identification tasks. The results demonstrated in this paper outperform the state-of-the-art methods.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="farhan-etal-2020-enhanced">
<titleInfo>
<title>Enhanced Urdu Word Segmentation using Conditional Random Fields and Morphological Context Features</title>
</titleInfo>
<name type="personal">
<namePart type="given">Aamir</namePart>
<namePart type="family">Farhan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mashrukh</namePart>
<namePart type="family">Islam</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dipti</namePart>
<namePart type="given">Misra</namePart>
<namePart type="family">Sharma</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Fourth Widening Natural Language Processing Workshop</title>
</titleInfo>
<name type="personal">
<namePart type="given">Rossana</namePart>
<namePart type="family">Cunha</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Samira</namePart>
<namePart type="family">Shaikh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Erika</namePart>
<namePart type="family">Varis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ryan</namePart>
<namePart type="family">Georgi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alicia</namePart>
<namePart type="family">Tsai</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Antonios</namePart>
<namePart type="family">Anastasopoulos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Khyathi</namePart>
<namePart type="given">Raghavi</namePart>
<namePart type="family">Chandu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Seattle, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Word segmentation is a fundamental task for most of the NLP applications. Urdu adopts Nastalique writing style which does not have a concept of space. Furthermore, the inherent non-joining attributes of certain characters in Urdu create spaces within a word while writing in digital format. Thus, Urdu not only has space omission but also space insertion issues which make the word segmentation task challenging. In this paper, we improve upon the results of Zia, Raza and Athar (2018) by using a manually annotated corpus of 19,651 sentences along with morphological context features. Using the Conditional Random Field sequence modeler, our model achieves F 1 score of 0.98 for word boundary identification and 0.92 for sub-word boundary identification tasks. The results demonstrated in this paper outperform the state-of-the-art methods.</abstract>
<identifier type="citekey">farhan-etal-2020-enhanced</identifier>
<identifier type="doi">10.18653/v1/2020.winlp-1.41</identifier>
<location>
<url>https://aclanthology.org/2020.winlp-1.41</url>
</location>
<part>
<date>2020-07</date>
<extent unit="page">
<start>156</start>
<end>159</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Enhanced Urdu Word Segmentation using Conditional Random Fields and Morphological Context Features
%A Farhan, Aamir
%A Islam, Mashrukh
%A Sharma, Dipti Misra
%Y Cunha, Rossana
%Y Shaikh, Samira
%Y Varis, Erika
%Y Georgi, Ryan
%Y Tsai, Alicia
%Y Anastasopoulos, Antonios
%Y Chandu, Khyathi Raghavi
%S Proceedings of the Fourth Widening Natural Language Processing Workshop
%D 2020
%8 July
%I Association for Computational Linguistics
%C Seattle, USA
%F farhan-etal-2020-enhanced
%X Word segmentation is a fundamental task for most of the NLP applications. Urdu adopts Nastalique writing style which does not have a concept of space. Furthermore, the inherent non-joining attributes of certain characters in Urdu create spaces within a word while writing in digital format. Thus, Urdu not only has space omission but also space insertion issues which make the word segmentation task challenging. In this paper, we improve upon the results of Zia, Raza and Athar (2018) by using a manually annotated corpus of 19,651 sentences along with morphological context features. Using the Conditional Random Field sequence modeler, our model achieves F 1 score of 0.98 for word boundary identification and 0.92 for sub-word boundary identification tasks. The results demonstrated in this paper outperform the state-of-the-art methods.
%R 10.18653/v1/2020.winlp-1.41
%U https://aclanthology.org/2020.winlp-1.41
%U https://doi.org/10.18653/v1/2020.winlp-1.41
%P 156-159
Markdown (Informal)
[Enhanced Urdu Word Segmentation using Conditional Random Fields and Morphological Context Features](https://aclanthology.org/2020.winlp-1.41) (Farhan et al., WiNLP 2020)
ACL