@inproceedings{khairunnisa-etal-2020-towards,
title = "Towards a Standardized Dataset on {I}ndonesian Named Entity Recognition",
author = "Khairunnisa, Siti Oryza and
Imankulova, Aizhan and
Komachi, Mamoru",
editor = "Shmueli, Boaz and
Huang, Yin Jou",
booktitle = "Proceedings of the 1st Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics and the 10th International Joint Conference on Natural Language Processing: Student Research Workshop",
month = dec,
year = "2020",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2020.aacl-srw.10",
doi = "10.18653/v1/2020.aacl-srw.10",
pages = "64--71",
abstract = "In recent years, named entity recognition (NER) tasks in the Indonesian language have undergone extensive development. There are only a few corpora for Indonesian NER; hence, recent Indonesian NER studies have used diverse datasets. Although an open dataset is available, it includes only approximately 2,000 sentences and contains inconsistent annotations, thereby preventing accurate training of NER models without reliance on pre-trained models. Therefore, we re-annotated the dataset and compared the two annotations{'} performance using the Bidirectional Long Short-Term Memory and Conditional Random Field (BiLSTM-CRF) approach. Fixing the annotation yielded a more consistent result for the organization tag and improved the prediction score by a large margin. Moreover, to take full advantage of pre-trained models, we compared different feature embeddings to determine their impact on the NER task for the Indonesian language.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="khairunnisa-etal-2020-towards">
<titleInfo>
<title>Towards a Standardized Dataset on Indonesian Named Entity Recognition</title>
</titleInfo>
<name type="personal">
<namePart type="given">Siti</namePart>
<namePart type="given">Oryza</namePart>
<namePart type="family">Khairunnisa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aizhan</namePart>
<namePart type="family">Imankulova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mamoru</namePart>
<namePart type="family">Komachi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 1st Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics and the 10th International Joint Conference on Natural Language Processing: Student Research Workshop</title>
</titleInfo>
<name type="personal">
<namePart type="given">Boaz</namePart>
<namePart type="family">Shmueli</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yin</namePart>
<namePart type="given">Jou</namePart>
<namePart type="family">Huang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Suzhou, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>In recent years, named entity recognition (NER) tasks in the Indonesian language have undergone extensive development. There are only a few corpora for Indonesian NER; hence, recent Indonesian NER studies have used diverse datasets. Although an open dataset is available, it includes only approximately 2,000 sentences and contains inconsistent annotations, thereby preventing accurate training of NER models without reliance on pre-trained models. Therefore, we re-annotated the dataset and compared the two annotations’ performance using the Bidirectional Long Short-Term Memory and Conditional Random Field (BiLSTM-CRF) approach. Fixing the annotation yielded a more consistent result for the organization tag and improved the prediction score by a large margin. Moreover, to take full advantage of pre-trained models, we compared different feature embeddings to determine their impact on the NER task for the Indonesian language.</abstract>
<identifier type="citekey">khairunnisa-etal-2020-towards</identifier>
<identifier type="doi">10.18653/v1/2020.aacl-srw.10</identifier>
<location>
<url>https://aclanthology.org/2020.aacl-srw.10</url>
</location>
<part>
<date>2020-12</date>
<extent unit="page">
<start>64</start>
<end>71</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Towards a Standardized Dataset on Indonesian Named Entity Recognition
%A Khairunnisa, Siti Oryza
%A Imankulova, Aizhan
%A Komachi, Mamoru
%Y Shmueli, Boaz
%Y Huang, Yin Jou
%S Proceedings of the 1st Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics and the 10th International Joint Conference on Natural Language Processing: Student Research Workshop
%D 2020
%8 December
%I Association for Computational Linguistics
%C Suzhou, China
%F khairunnisa-etal-2020-towards
%X In recent years, named entity recognition (NER) tasks in the Indonesian language have undergone extensive development. There are only a few corpora for Indonesian NER; hence, recent Indonesian NER studies have used diverse datasets. Although an open dataset is available, it includes only approximately 2,000 sentences and contains inconsistent annotations, thereby preventing accurate training of NER models without reliance on pre-trained models. Therefore, we re-annotated the dataset and compared the two annotations’ performance using the Bidirectional Long Short-Term Memory and Conditional Random Field (BiLSTM-CRF) approach. Fixing the annotation yielded a more consistent result for the organization tag and improved the prediction score by a large margin. Moreover, to take full advantage of pre-trained models, we compared different feature embeddings to determine their impact on the NER task for the Indonesian language.
%R 10.18653/v1/2020.aacl-srw.10
%U https://aclanthology.org/2020.aacl-srw.10
%U https://doi.org/10.18653/v1/2020.aacl-srw.10
%P 64-71
Markdown (Informal)
[Towards a Standardized Dataset on Indonesian Named Entity Recognition](https://aclanthology.org/2020.aacl-srw.10) (Khairunnisa et al., AACL 2020)
ACL
- Siti Oryza Khairunnisa, Aizhan Imankulova, and Mamoru Komachi. 2020. Towards a Standardized Dataset on Indonesian Named Entity Recognition. In Proceedings of the 1st Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics and the 10th International Joint Conference on Natural Language Processing: Student Research Workshop, pages 64–71, Suzhou, China. Association for Computational Linguistics.