@inproceedings{qiu-etal-2024-complex,
title = "{C}omp{L}ex-{ZH}: A New Dataset for Lexical Complexity Prediction in {M}andarin and {C}antonese",
author = "Qiu, Le and
Guo, Shanyue and
Wong, Tak-Sum and
Chersoni, Emmanuele and
Lee, John and
Huang, Chu-Ren",
editor = "Shardlow, Matthew and
Saggion, Horacio and
Alva-Manchego, Fernando and
Zampieri, Marcos and
North, Kai and
{\v{S}}tajner, Sanja and
Stodden, Regina",
booktitle = "Proceedings of the Third Workshop on Text Simplification, Accessibility and Readability (TSAR 2024)",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.tsar-1.3",
pages = "20--26",
abstract = "The prediction of lexical complexity in context is assuming an increasing relevance in Natural Language Processing research, since identifying complex words is often the first step of text simplification pipelines. To the best of our knowledge, though, datasets annotated with complex words are available only for English and for a limited number of Western languages. In our paper, we introduce CompLex-ZH, a dataset including words annotated with complexity scores in sentential contexts for Chinese. Our data include sentences in Mandarin and Cantonese, which were selected from a variety of sources and textual genres. We provide a first evaluation with baselines combining hand-crafted and language models-based features.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="qiu-etal-2024-complex">
<titleInfo>
<title>CompLex-ZH: A New Dataset for Lexical Complexity Prediction in Mandarin and Cantonese</title>
</titleInfo>
<name type="personal">
<namePart type="given">Le</namePart>
<namePart type="family">Qiu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shanyue</namePart>
<namePart type="family">Guo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tak-Sum</namePart>
<namePart type="family">Wong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Emmanuele</namePart>
<namePart type="family">Chersoni</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">John</namePart>
<namePart type="family">Lee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chu-Ren</namePart>
<namePart type="family">Huang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Third Workshop on Text Simplification, Accessibility and Readability (TSAR 2024)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Matthew</namePart>
<namePart type="family">Shardlow</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Horacio</namePart>
<namePart type="family">Saggion</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Fernando</namePart>
<namePart type="family">Alva-Manchego</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marcos</namePart>
<namePart type="family">Zampieri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kai</namePart>
<namePart type="family">North</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sanja</namePart>
<namePart type="family">Štajner</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Regina</namePart>
<namePart type="family">Stodden</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, Florida, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>The prediction of lexical complexity in context is assuming an increasing relevance in Natural Language Processing research, since identifying complex words is often the first step of text simplification pipelines. To the best of our knowledge, though, datasets annotated with complex words are available only for English and for a limited number of Western languages. In our paper, we introduce CompLex-ZH, a dataset including words annotated with complexity scores in sentential contexts for Chinese. Our data include sentences in Mandarin and Cantonese, which were selected from a variety of sources and textual genres. We provide a first evaluation with baselines combining hand-crafted and language models-based features.</abstract>
<identifier type="citekey">qiu-etal-2024-complex</identifier>
<location>
<url>https://aclanthology.org/2024.tsar-1.3</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>20</start>
<end>26</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T CompLex-ZH: A New Dataset for Lexical Complexity Prediction in Mandarin and Cantonese
%A Qiu, Le
%A Guo, Shanyue
%A Wong, Tak-Sum
%A Chersoni, Emmanuele
%A Lee, John
%A Huang, Chu-Ren
%Y Shardlow, Matthew
%Y Saggion, Horacio
%Y Alva-Manchego, Fernando
%Y Zampieri, Marcos
%Y North, Kai
%Y Štajner, Sanja
%Y Stodden, Regina
%S Proceedings of the Third Workshop on Text Simplification, Accessibility and Readability (TSAR 2024)
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F qiu-etal-2024-complex
%X The prediction of lexical complexity in context is assuming an increasing relevance in Natural Language Processing research, since identifying complex words is often the first step of text simplification pipelines. To the best of our knowledge, though, datasets annotated with complex words are available only for English and for a limited number of Western languages. In our paper, we introduce CompLex-ZH, a dataset including words annotated with complexity scores in sentential contexts for Chinese. Our data include sentences in Mandarin and Cantonese, which were selected from a variety of sources and textual genres. We provide a first evaluation with baselines combining hand-crafted and language models-based features.
%U https://aclanthology.org/2024.tsar-1.3
%P 20-26
Markdown (Informal)
[CompLex-ZH: A New Dataset for Lexical Complexity Prediction in Mandarin and Cantonese](https://aclanthology.org/2024.tsar-1.3) (Qiu et al., TSAR 2024)
ACL
- Le Qiu, Shanyue Guo, Tak-Sum Wong, Emmanuele Chersoni, John Lee, and Chu-Ren Huang. 2024. CompLex-ZH: A New Dataset for Lexical Complexity Prediction in Mandarin and Cantonese. In Proceedings of the Third Workshop on Text Simplification, Accessibility and Readability (TSAR 2024), pages 20–26, Miami, Florida, USA. Association for Computational Linguistics.