% Conference paper record exported from the ACL Anthology (2023.nlp4dh-1.11).
% Braces inside the title protect the proper noun from style-driven recasing.
@inproceedings{blouin-etal-2023-unlocking,
  title     = {Unlocking Transitional {Chinese}: Word Segmentation in Modern Historical Texts},
  author    = {Blouin, Baptiste and
               Huang, Hen-Hsen and
               Henriot, Christian and
               Armand, C{\'e}cile},
  editor    = {H{\"a}m{\"a}l{\"a}inen, Mika and
               {\"O}hman, Emily and
               Pirinen, Flammie and
               Alnajjar, Khalid and
               Miyagawa, So and
               Bizzoni, Yuri and
               Partanen, Niko and
               Rueter, Jack},
  booktitle = {Proceedings of the Joint 3rd International Conference on Natural Language Processing for Digital Humanities and 8th International Workshop on Computational Linguistics for Uralic Languages},
  month     = dec,
  year      = {2023},
  address   = {Tokyo, Japan},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2023.nlp4dh-1.11},
  pages     = {92--101},
  abstract  = {This research addresses Natural Language Processing (NLP) tokenization challenges for transitional Chinese, which lacks adequate digital resources. The project used a collection of articles from the Shenbao, a newspaper from this period, as their study base. They designed models tailored to transitional Chinese, with goals like historical information extraction, large-scale textual analysis, and creating new datasets for computational linguists. The team manually tokenized historical articles to understand the language{'}s linguistic patterns, syntactic structures, and lexical variations. They developed a custom model tailored to their dataset after evaluating various word segmentation tools. They also studied the impact of using pre-trained language models on historical data. The results showed that using language models aligned with the source languages resulted in superior performance. They assert that transitional Chinese they are processing is more related to ancient Chinese than contemporary Chinese, necessitating the training of language models specifically on their data. The study{'}s outcome is a model that achieves a performance of over 83{\%} and an F-score that is 35{\%} higher than using existing tokenization tools, signifying a substantial improvement. The availability of this new annotated dataset paves the way for refining the model{'}s performance in processing this type of data.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 collection holding a single bibliographic record. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="blouin-etal-2023-unlocking">
    <!-- Paper title and its four authors, in byline order. -->
    <titleInfo>
      <title>Unlocking Transitional Chinese: Word Segmentation in Modern Historical Texts</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Baptiste</namePart>
      <namePart type="family">Blouin</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Hen-Hsen</namePart>
      <namePart type="family">Huang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Christian</namePart>
      <namePart type="family">Henriot</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Cécile</namePart>
      <namePart type="family">Armand</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2023-12</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, its editors, publisher and place. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the Joint 3rd International Conference on Natural Language Processing for Digital Humanities and 8th International Workshop on Computational Linguistics for Uralic Languages</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Mika</namePart>
        <namePart type="family">Hämäläinen</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Emily</namePart>
        <namePart type="family">Öhman</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Flammie</namePart>
        <namePart type="family">Pirinen</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Khalid</namePart>
        <namePart type="family">Alnajjar</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">So</namePart>
        <namePart type="family">Miyagawa</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Yuri</namePart>
        <namePart type="family">Bizzoni</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Niko</namePart>
        <namePart type="family">Partanen</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jack</namePart>
        <namePart type="family">Rueter</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Tokyo, Japan</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>This research addresses Natural Language Processing (NLP) tokenization challenges for transitional Chinese, which lacks adequate digital resources. The project used a collection of articles from the Shenbao, a newspaper from this period, as their study base. They designed models tailored to transitional Chinese, with goals like historical information extraction, large-scale textual analysis, and creating new datasets for computational linguists. The team manually tokenized historical articles to understand the language’s linguistic patterns, syntactic structures, and lexical variations. They developed a custom model tailored to their dataset after evaluating various word segmentation tools. They also studied the impact of using pre-trained language models on historical data. The results showed that using language models aligned with the source languages resulted in superior performance. They assert that transitional Chinese they are processing is more related to ancient Chinese than contemporary Chinese, necessitating the training of language models specifically on their data. The study’s outcome is a model that achieves a performance of over 83% and an F-score that is 35% higher than using existing tokenization tools, signifying a substantial improvement. The availability of this new annotated dataset paves the way for refining the model’s performance in processing this type of data.</abstract>
    <!-- Citation key, landing URL, and page span within the host volume. -->
    <identifier type="citekey">blouin-etal-2023-unlocking</identifier>
    <location>
      <url>https://aclanthology.org/2023.nlp4dh-1.11</url>
    </location>
    <part>
      <date>2023-12</date>
      <extent unit="page">
        <start>92</start>
        <end>101</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Unlocking Transitional Chinese: Word Segmentation in Modern Historical Texts
%A Blouin, Baptiste
%A Huang, Hen-Hsen
%A Henriot, Christian
%A Armand, Cécile
%Y Hämäläinen, Mika
%Y Öhman, Emily
%Y Pirinen, Flammie
%Y Alnajjar, Khalid
%Y Miyagawa, So
%Y Bizzoni, Yuri
%Y Partanen, Niko
%Y Rueter, Jack
%S Proceedings of the Joint 3rd International Conference on Natural Language Processing for Digital Humanities and 8th International Workshop on Computational Linguistics for Uralic Languages
%D 2023
%8 December
%I Association for Computational Linguistics
%C Tokyo, Japan
%F blouin-etal-2023-unlocking
%X This research addresses Natural Language Processing (NLP) tokenization challenges for transitional Chinese, which lacks adequate digital resources. The project used a collection of articles from the Shenbao, a newspaper from this period, as their study base. They designed models tailored to transitional Chinese, with goals like historical information extraction, large-scale textual analysis, and creating new datasets for computational linguists. The team manually tokenized historical articles to understand the language’s linguistic patterns, syntactic structures, and lexical variations. They developed a custom model tailored to their dataset after evaluating various word segmentation tools. They also studied the impact of using pre-trained language models on historical data. The results showed that using language models aligned with the source languages resulted in superior performance. They assert that transitional Chinese they are processing is more related to ancient Chinese than contemporary Chinese, necessitating the training of language models specifically on their data. The study’s outcome is a model that achieves a performance of over 83% and an F-score that is 35% higher than using existing tokenization tools, signifying a substantial improvement. The availability of this new annotated dataset paves the way for refining the model’s performance in processing this type of data.
%U https://aclanthology.org/2023.nlp4dh-1.11
%P 92-101
Markdown (Informal)
[Unlocking Transitional Chinese: Word Segmentation in Modern Historical Texts](https://aclanthology.org/2023.nlp4dh-1.11) (Blouin et al., NLP4DH-IWCLUL 2023)
ACL
- Baptiste Blouin, Hen-Hsen Huang, Christian Henriot, and Cécile Armand. 2023. Unlocking Transitional Chinese: Word Segmentation in Modern Historical Texts. In Proceedings of the Joint 3rd International Conference on Natural Language Processing for Digital Humanities and 8th International Workshop on Computational Linguistics for Uralic Languages, pages 92–101, Tokyo, Japan. Association for Computational Linguistics.