@inproceedings{zhang-etal-2024-chinese,
title = "{C}hinese Sequence Labeling with Semi-Supervised Boundary-Aware Language Model Pre-training",
author = "Zhang, Longhui and
Long, Dingkun and
Zhang, Meishan and
Zhang, Yanzhao and
Xie, Pengjun and
Zhang, Min",
editor = "Calzolari, Nicoletta and
Kan, Min-Yen and
Hoste, Veronique and
Lenci, Alessandro and
Sakti, Sakriani and
Xue, Nianwen",
booktitle = "Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)",
month = may,
year = "2024",
address = "Torino, Italia",
publisher = "ELRA and ICCL",
url = "https://aclanthology.org/2024.lrec-main.282",
pages = "3179--3191",
abstract = "Chinese sequence labeling tasks are sensitive to word boundaries. Although pretrained language models (PLM) have achieved considerable success in these tasks, current PLMs rarely consider boundary information explicitly. An exception to this is BABERT, which incorporates unsupervised statistical boundary information into Chinese BERT{'}s pre-training objectives. Building upon this approach, we input supervised high-quality boundary information to enhance BABERT{'}s learning, developing a semi-supervised boundary-aware PLM. To assess PLMs{'} ability to encode boundaries, we introduce a novel {``}Boundary Information Metric{''} that is both simple and effective. This metric allows comparison of different PLMs without task-specific fine-tuning. Experimental results on Chinese sequence labeling datasets demonstrate that the improved BABERT version outperforms the vanilla version, not only in these tasks but also in broader Chinese natural language understanding tasks. Additionally, our proposed metric offers a convenient and accurate means of evaluating PLMs{'} boundary awareness.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="zhang-etal-2024-chinese">
<titleInfo>
<title>Chinese Sequence Labeling with Semi-Supervised Boundary-Aware Language Model Pre-training</title>
</titleInfo>
<name type="personal">
<namePart type="given">Longhui</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dingkun</namePart>
<namePart type="family">Long</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Meishan</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yanzhao</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pengjun</namePart>
<namePart type="family">Xie</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Min</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Min-Yen</namePart>
<namePart type="family">Kan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Veronique</namePart>
<namePart type="family">Hoste</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alessandro</namePart>
<namePart type="family">Lenci</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sakriani</namePart>
<namePart type="family">Sakti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nianwen</namePart>
<namePart type="family">Xue</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>ELRA and ICCL</publisher>
<place>
<placeTerm type="text">Torino, Italia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Chinese sequence labeling tasks are sensitive to word boundaries. Although pretrained language models (PLM) have achieved considerable success in these tasks, current PLMs rarely consider boundary information explicitly. An exception to this is BABERT, which incorporates unsupervised statistical boundary information into Chinese BERT’s pre-training objectives. Building upon this approach, we input supervised high-quality boundary information to enhance BABERT’s learning, developing a semi-supervised boundary-aware PLM. To assess PLMs’ ability to encode boundaries, we introduce a novel “Boundary Information Metric” that is both simple and effective. This metric allows comparison of different PLMs without task-specific fine-tuning. Experimental results on Chinese sequence labeling datasets demonstrate that the improved BABERT version outperforms the vanilla version, not only in these tasks but also in broader Chinese natural language understanding tasks. Additionally, our proposed metric offers a convenient and accurate means of evaluating PLMs’ boundary awareness.</abstract>
<identifier type="citekey">zhang-etal-2024-chinese</identifier>
<location>
<url>https://aclanthology.org/2024.lrec-main.282</url>
</location>
<part>
<date>2024-05</date>
<extent unit="page">
<start>3179</start>
<end>3191</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Chinese Sequence Labeling with Semi-Supervised Boundary-Aware Language Model Pre-training
%A Zhang, Longhui
%A Long, Dingkun
%A Zhang, Meishan
%A Zhang, Yanzhao
%A Xie, Pengjun
%A Zhang, Min
%Y Calzolari, Nicoletta
%Y Kan, Min-Yen
%Y Hoste, Veronique
%Y Lenci, Alessandro
%Y Sakti, Sakriani
%Y Xue, Nianwen
%S Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)
%D 2024
%8 May
%I ELRA and ICCL
%C Torino, Italia
%F zhang-etal-2024-chinese
%X Chinese sequence labeling tasks are sensitive to word boundaries. Although pretrained language models (PLM) have achieved considerable success in these tasks, current PLMs rarely consider boundary information explicitly. An exception to this is BABERT, which incorporates unsupervised statistical boundary information into Chinese BERT’s pre-training objectives. Building upon this approach, we input supervised high-quality boundary information to enhance BABERT’s learning, developing a semi-supervised boundary-aware PLM. To assess PLMs’ ability to encode boundaries, we introduce a novel “Boundary Information Metric” that is both simple and effective. This metric allows comparison of different PLMs without task-specific fine-tuning. Experimental results on Chinese sequence labeling datasets demonstrate that the improved BABERT version outperforms the vanilla version, not only in these tasks but also in broader Chinese natural language understanding tasks. Additionally, our proposed metric offers a convenient and accurate means of evaluating PLMs’ boundary awareness.
%U https://aclanthology.org/2024.lrec-main.282
%P 3179-3191
Markdown (Informal)
[Chinese Sequence Labeling with Semi-Supervised Boundary-Aware Language Model Pre-training](https://aclanthology.org/2024.lrec-main.282) (Zhang et al., LREC-COLING 2024)
ACL