@inproceedings{akdemir-etal-2022-developing,
title = "Developing Language Resources and {NLP} Tools for the {N}orth {K}orean Language",
author = "Akdemir, Arda and
Jeon, Yeojoo and
Shibuya, Tetsuo",
editor = "Calzolari, Nicoletta and
B{\'e}chet, Fr{\'e}d{\'e}ric and
Blache, Philippe and
Choukri, Khalid and
Cieri, Christopher and
Declerck, Thierry and
Goggi, Sara and
Isahara, Hitoshi and
Maegaard, Bente and
Mariani, Joseph and
Mazo, H{\'e}l{\`e}ne and
Odijk, Jan and
Piperidis, Stelios",
booktitle = "Proceedings of the Thirteenth Language Resources and Evaluation Conference",
month = jun,
year = "2022",
address = "Marseille, France",
publisher = "European Language Resources Association",
url = "https://aclanthology.org/2022.lrec-1.600",
pages = "5595--5600",
abstract = "Since the division of Korea, the two Korean languages have diverged significantly over the last 70 years. However, due to the lack of linguistic source of the North Korean language, there is no DPRK-based language model. Consequently, scholars rely on the Korean language model by utilizing South Korean linguistic data. In this paper, we first present a large-scale dataset for the North Korean language. We use the dataset to train a BERT-based language model, DPRK-BERT. Second, we annotate a subset of this dataset for the sentiment analysis task. Finally, we compare the performance of different language models for masked language modeling and sentiment analysis tasks.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="akdemir-etal-2022-developing">
<titleInfo>
<title>Developing Language Resources and NLP Tools for the North Korean Language</title>
</titleInfo>
<name type="personal">
<namePart type="given">Arda</namePart>
<namePart type="family">Akdemir</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yeojoo</namePart>
<namePart type="family">Jeon</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tetsuo</namePart>
<namePart type="family">Shibuya</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Thirteenth Language Resources and Evaluation Conference</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Frédéric</namePart>
<namePart type="family">Béchet</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Philippe</namePart>
<namePart type="family">Blache</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Khalid</namePart>
<namePart type="family">Choukri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Christopher</namePart>
<namePart type="family">Cieri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Thierry</namePart>
<namePart type="family">Declerck</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sara</namePart>
<namePart type="family">Goggi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hitoshi</namePart>
<namePart type="family">Isahara</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bente</namePart>
<namePart type="family">Maegaard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joseph</namePart>
<namePart type="family">Mariani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hélène</namePart>
<namePart type="family">Mazo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jan</namePart>
<namePart type="family">Odijk</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stelios</namePart>
<namePart type="family">Piperidis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>European Language Resources Association</publisher>
<place>
<placeTerm type="text">Marseille, France</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Since the division of Korea, the two Korean languages have diverged significantly over the last 70 years. However, due to the lack of linguistic source of the North Korean language, there is no DPRK-based language model. Consequently, scholars rely on the Korean language model by utilizing South Korean linguistic data. In this paper, we first present a large-scale dataset for the North Korean language. We use the dataset to train a BERT-based language model, DPRK-BERT. Second, we annotate a subset of this dataset for the sentiment analysis task. Finally, we compare the performance of different language models for masked language modeling and sentiment analysis tasks.</abstract>
<identifier type="citekey">akdemir-etal-2022-developing</identifier>
<location>
<url>https://aclanthology.org/2022.lrec-1.600</url>
</location>
<part>
<date>2022-06</date>
<extent unit="page">
<start>5595</start>
<end>5600</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Developing Language Resources and NLP Tools for the North Korean Language
%A Akdemir, Arda
%A Jeon, Yeojoo
%A Shibuya, Tetsuo
%Y Calzolari, Nicoletta
%Y Béchet, Frédéric
%Y Blache, Philippe
%Y Choukri, Khalid
%Y Cieri, Christopher
%Y Declerck, Thierry
%Y Goggi, Sara
%Y Isahara, Hitoshi
%Y Maegaard, Bente
%Y Mariani, Joseph
%Y Mazo, Hélène
%Y Odijk, Jan
%Y Piperidis, Stelios
%S Proceedings of the Thirteenth Language Resources and Evaluation Conference
%D 2022
%8 June
%I European Language Resources Association
%C Marseille, France
%F akdemir-etal-2022-developing
%X Since the division of Korea, the two Korean languages have diverged significantly over the last 70 years. However, due to the lack of linguistic source of the North Korean language, there is no DPRK-based language model. Consequently, scholars rely on the Korean language model by utilizing South Korean linguistic data. In this paper, we first present a large-scale dataset for the North Korean language. We use the dataset to train a BERT-based language model, DPRK-BERT. Second, we annotate a subset of this dataset for the sentiment analysis task. Finally, we compare the performance of different language models for masked language modeling and sentiment analysis tasks.
%U https://aclanthology.org/2022.lrec-1.600
%P 5595-5600
Markdown (Informal)
[Developing Language Resources and NLP Tools for the North Korean Language](https://aclanthology.org/2022.lrec-1.600) (Akdemir et al., LREC 2022)
ACL