@inproceedings{nguyen-kauchak-2022-complex,
title = "Complex Word Identification in {V}ietnamese: Towards {V}ietnamese Text Simplification",
author = "Nguyen, Phuong and
Kauchak, David",
editor = "Asai, Akari and
Choi, Eunsol and
Clark, Jonathan H. and
Hu, Junjie and
Lee, Chia-Hsuan and
Kasai, Jungo and
Longpre, Shayne and
Yamada, Ikuya and
Zhang, Rui",
booktitle = "Proceedings of the Workshop on Multilingual Information Access (MIA)",
month = jul,
year = "2022",
address = "Seattle, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.mia-1.6",
doi = "10.18653/v1/2022.mia-1.6",
pages = "59--68",
abstract = "Text Simplification has been an extensively researched problem in English, but has not been investigated in Vietnamese. We focus on the Vietnamese-specific Complex Word Identification task, often the first step in Lexical Simplification (Shardlow, 2013). We examine three different Vietnamese datasets constructed for other Natural Language Processing tasks and show that, like in other languages, frequency is a strong signal in determining whether a word is complex, with a mean accuracy of 86.87{\%}. Across the datasets, we find that the 10{\%} most frequent words in many corpus can be labelled as simple, and the rest as complex, though this is more variable for smaller corpora. We also examine how human annotators perform at this task. Given the subjective nature, there is a fair amount of variability in which words are seen as difficult, though majority results are more consistent.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="nguyen-kauchak-2022-complex">
<titleInfo>
<title>Complex Word Identification in Vietnamese: Towards Vietnamese Text Simplification</title>
</titleInfo>
<name type="personal">
<namePart type="given">Phuong</namePart>
<namePart type="family">Nguyen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Kauchak</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Workshop on Multilingual Information Access (MIA)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Akari</namePart>
<namePart type="family">Asai</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Eunsol</namePart>
<namePart type="family">Choi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jonathan</namePart>
<namePart type="given">H</namePart>
<namePart type="family">Clark</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Junjie</namePart>
<namePart type="family">Hu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chia-Hsuan</namePart>
<namePart type="family">Lee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jungo</namePart>
<namePart type="family">Kasai</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shayne</namePart>
<namePart type="family">Longpre</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ikuya</namePart>
<namePart type="family">Yamada</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rui</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Seattle, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Text Simplification has been an extensively researched problem in English, but has not been investigated in Vietnamese. We focus on the Vietnamese-specific Complex Word Identification task, often the first step in Lexical Simplification (Shardlow, 2013). We examine three different Vietnamese datasets constructed for other Natural Language Processing tasks and show that, like in other languages, frequency is a strong signal in determining whether a word is complex, with a mean accuracy of 86.87%. Across the datasets, we find that the 10% most frequent words in many corpus can be labelled as simple, and the rest as complex, though this is more variable for smaller corpora. We also examine how human annotators perform at this task. Given the subjective nature, there is a fair amount of variability in which words are seen as difficult, though majority results are more consistent.</abstract>
<identifier type="citekey">nguyen-kauchak-2022-complex</identifier>
<identifier type="doi">10.18653/v1/2022.mia-1.6</identifier>
<location>
<url>https://aclanthology.org/2022.mia-1.6</url>
</location>
<part>
<date>2022-07</date>
<extent unit="page">
<start>59</start>
<end>68</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Complex Word Identification in Vietnamese: Towards Vietnamese Text Simplification
%A Nguyen, Phuong
%A Kauchak, David
%Y Asai, Akari
%Y Choi, Eunsol
%Y Clark, Jonathan H.
%Y Hu, Junjie
%Y Lee, Chia-Hsuan
%Y Kasai, Jungo
%Y Longpre, Shayne
%Y Yamada, Ikuya
%Y Zhang, Rui
%S Proceedings of the Workshop on Multilingual Information Access (MIA)
%D 2022
%8 July
%I Association for Computational Linguistics
%C Seattle, USA
%F nguyen-kauchak-2022-complex
%X Text Simplification has been an extensively researched problem in English, but has not been investigated in Vietnamese. We focus on the Vietnamese-specific Complex Word Identification task, often the first step in Lexical Simplification (Shardlow, 2013). We examine three different Vietnamese datasets constructed for other Natural Language Processing tasks and show that, like in other languages, frequency is a strong signal in determining whether a word is complex, with a mean accuracy of 86.87%. Across the datasets, we find that the 10% most frequent words in many corpus can be labelled as simple, and the rest as complex, though this is more variable for smaller corpora. We also examine how human annotators perform at this task. Given the subjective nature, there is a fair amount of variability in which words are seen as difficult, though majority results are more consistent.
%R 10.18653/v1/2022.mia-1.6
%U https://aclanthology.org/2022.mia-1.6
%U https://doi.org/10.18653/v1/2022.mia-1.6
%P 59-68
Markdown (Informal)
[Complex Word Identification in Vietnamese: Towards Vietnamese Text Simplification](https://aclanthology.org/2022.mia-1.6) (Nguyen & Kauchak, MIA 2022)
ACL