@inproceedings{wang-li-2024-two,
title = "Two Sequence Labeling Approaches to Sentence Segmentation and Punctuation Prediction for Classic {C}hinese Texts",
author = "Wang, Xuebin and
Li, Zhenghua",
editor = "Sprugnoli, Rachele and
Passarotti, Marco",
booktitle = "Proceedings of the Third Workshop on Language Technologies for Historical and Ancient Languages (LT4HALA) @ LREC-COLING-2024",
month = may,
year = "2024",
address = "Torino, Italia",
publisher = "ELRA and ICCL",
url = "https://aclanthology.org/2024.lt4hala-1.28",
pages = "237--241",
abstract = "This paper describes our system for the EvaHan2024 shared task. We design and experiment with two sequence labeling approaches, i.e., one-stage and two-stage approaches. The one-stage approach directly predicts a label for each character, and the label may contain multiple punctuation marks. The two-stage approach divides punctuation marks into two classes, i.e., pause and non-pause, and separately handles them via two sequence labeling processes. The labels contain at most one punctuation mark. We use pre-trained SikuRoBERTa as a key component of the encoder and employ a conditional random field (CRF) layer on the top. According to the evaluation metrics adopted by the organizers, the two-stage approach is superior to the one-stage approach, and our system achieves the second place among all participant systems.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="wang-li-2024-two">
<titleInfo>
<title>Two Sequence Labeling Approaches to Sentence Segmentation and Punctuation Prediction for Classic Chinese Texts</title>
</titleInfo>
<name type="personal">
<namePart type="given">Xuebin</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhenghua</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Third Workshop on Language Technologies for Historical and Ancient Languages (LT4HALA) @ LREC-COLING-2024</title>
</titleInfo>
<name type="personal">
<namePart type="given">Rachele</namePart>
<namePart type="family">Sprugnoli</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marco</namePart>
<namePart type="family">Passarotti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>ELRA and ICCL</publisher>
<place>
<placeTerm type="text">Torino, Italia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>This paper describes our system for the EvaHan2024 shared task. We design and experiment with two sequence labeling approaches, i.e., one-stage and two-stage approaches. The one-stage approach directly predicts a label for each character, and the label may contain multiple punctuation marks. The two-stage approach divides punctuation marks into two classes, i.e., pause and non-pause, and separately handles them via two sequence labeling processes. The labels contain at most one punctuation mark. We use pre-trained SikuRoBERTa as a key component of the encoder and employ a conditional random field (CRF) layer on the top. According to the evaluation metrics adopted by the organizers, the two-stage approach is superior to the one-stage approach, and our system achieves the second place among all participant systems.</abstract>
<identifier type="citekey">wang-li-2024-two</identifier>
<location>
<url>https://aclanthology.org/2024.lt4hala-1.28</url>
</location>
<part>
<date>2024-05</date>
<extent unit="page">
<start>237</start>
<end>241</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Two Sequence Labeling Approaches to Sentence Segmentation and Punctuation Prediction for Classic Chinese Texts
%A Wang, Xuebin
%A Li, Zhenghua
%Y Sprugnoli, Rachele
%Y Passarotti, Marco
%S Proceedings of the Third Workshop on Language Technologies for Historical and Ancient Languages (LT4HALA) @ LREC-COLING-2024
%D 2024
%8 May
%I ELRA and ICCL
%C Torino, Italia
%F wang-li-2024-two
%X This paper describes our system for the EvaHan2024 shared task. We design and experiment with two sequence labeling approaches, i.e., one-stage and two-stage approaches. The one-stage approach directly predicts a label for each character, and the label may contain multiple punctuation marks. The two-stage approach divides punctuation marks into two classes, i.e., pause and non-pause, and separately handles them via two sequence labeling processes. The labels contain at most one punctuation mark. We use pre-trained SikuRoBERTa as a key component of the encoder and employ a conditional random field (CRF) layer on the top. According to the evaluation metrics adopted by the organizers, the two-stage approach is superior to the one-stage approach, and our system achieves the second place among all participant systems.
%U https://aclanthology.org/2024.lt4hala-1.28
%P 237-241
Markdown (Informal)
[Two Sequence Labeling Approaches to Sentence Segmentation and Punctuation Prediction for Classic Chinese Texts](https://aclanthology.org/2024.lt4hala-1.28) (Wang & Li, LT4HALA-WS 2024)
ACL