@inproceedings{kim-etal-2022-break,
title = "Break it Down into {BTS}: Basic, Tiniest Subword Units for {K}orean",
author = "Kim, Nayeon and
Park, Jun-Hyung and
Choi, Joon-Young and
Jeon, Eojin and
Kang, Youjin and
Lee, SangKeun",
editor = "Goldberg, Yoav and
Kozareva, Zornitsa and
Zhang, Yue",
booktitle = "Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing",
month = dec,
year = "2022",
address = "Abu Dhabi, United Arab Emirates",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.emnlp-main.472",
doi = "10.18653/v1/2022.emnlp-main.472",
pages = "7007--7024",
abstract = "We introduce Basic, Tiniest Subword (BTS) units for the Korean language, which are inspired by the invention principle of Hangeul, the Korean writing system. Instead of relying on 51 Korean consonant and vowel letters, we form the letters from BTS units by adding strokes or combining them. To examine the impact of BTS units on Korean language processing, we develop a novel BTS-based word embedding framework that is readily applicable to various models. Our experiments reveal that BTS units significantly improve the performance of Korean word embedding on all intrinsic and extrinsic tasks in our evaluation. In particular, BTS-based word embedding outperforms the state-of-theart Korean word embedding by 11.8{\%} in word analogy. We further investigate the unique advantages provided by BTS units through indepth analysis.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="kim-etal-2022-break">
<titleInfo>
<title>Break it Down into BTS: Basic, Tiniest Subword Units for Korean</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nayeon</namePart>
<namePart type="family">Kim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jun-Hyung</namePart>
<namePart type="family">Park</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joon-Young</namePart>
<namePart type="family">Choi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Eojin</namePart>
<namePart type="family">Jeon</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Youjin</namePart>
<namePart type="family">Kang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">SangKeun</namePart>
<namePart type="family">Lee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yoav</namePart>
<namePart type="family">Goldberg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zornitsa</namePart>
<namePart type="family">Kozareva</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yue</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, United Arab Emirates</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We introduce Basic, Tiniest Subword (BTS) units for the Korean language, which are inspired by the invention principle of Hangeul, the Korean writing system. Instead of relying on 51 Korean consonant and vowel letters, we form the letters from BTS units by adding strokes or combining them. To examine the impact of BTS units on Korean language processing, we develop a novel BTS-based word embedding framework that is readily applicable to various models. Our experiments reveal that BTS units significantly improve the performance of Korean word embedding on all intrinsic and extrinsic tasks in our evaluation. In particular, BTS-based word embedding outperforms the state-of-theart Korean word embedding by 11.8% in word analogy. We further investigate the unique advantages provided by BTS units through indepth analysis.</abstract>
<identifier type="citekey">kim-etal-2022-break</identifier>
<identifier type="doi">10.18653/v1/2022.emnlp-main.472</identifier>
<location>
<url>https://aclanthology.org/2022.emnlp-main.472</url>
</location>
<part>
<date>2022-12</date>
<extent unit="page">
<start>7007</start>
<end>7024</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Break it Down into BTS: Basic, Tiniest Subword Units for Korean
%A Kim, Nayeon
%A Park, Jun-Hyung
%A Choi, Joon-Young
%A Jeon, Eojin
%A Kang, Youjin
%A Lee, SangKeun
%Y Goldberg, Yoav
%Y Kozareva, Zornitsa
%Y Zhang, Yue
%S Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing
%D 2022
%8 December
%I Association for Computational Linguistics
%C Abu Dhabi, United Arab Emirates
%F kim-etal-2022-break
%X We introduce Basic, Tiniest Subword (BTS) units for the Korean language, which are inspired by the invention principle of Hangeul, the Korean writing system. Instead of relying on 51 Korean consonant and vowel letters, we form the letters from BTS units by adding strokes or combining them. To examine the impact of BTS units on Korean language processing, we develop a novel BTS-based word embedding framework that is readily applicable to various models. Our experiments reveal that BTS units significantly improve the performance of Korean word embedding on all intrinsic and extrinsic tasks in our evaluation. In particular, BTS-based word embedding outperforms the state-of-theart Korean word embedding by 11.8% in word analogy. We further investigate the unique advantages provided by BTS units through indepth analysis.
%R 10.18653/v1/2022.emnlp-main.472
%U https://aclanthology.org/2022.emnlp-main.472
%U https://doi.org/10.18653/v1/2022.emnlp-main.472
%P 7007-7024
Markdown (Informal)
[Break it Down into BTS: Basic, Tiniest Subword Units for Korean](https://aclanthology.org/2022.emnlp-main.472) (Kim et al., EMNLP 2022)
ACL
- Nayeon Kim, Jun-Hyung Park, Joon-Young Choi, Eojin Jeon, Youjin Kang, and SangKeun Lee. 2022. Break it Down into BTS: Basic, Tiniest Subword Units for Korean. In Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing, pages 7007–7024, Abu Dhabi, United Arab Emirates. Association for Computational Linguistics.