@inproceedings{murawaki-mori-2016-wikification,
title = "Wikification for Scriptio Continua",
author = "Murawaki, Yugo and
Mori, Shinsuke",
editor = "Calzolari, Nicoletta and
Choukri, Khalid and
Declerck, Thierry and
Goggi, Sara and
Grobelnik, Marko and
Maegaard, Bente and
Mariani, Joseph and
Mazo, Helene and
Moreno, Asuncion and
Odijk, Jan and
Piperidis, Stelios",
booktitle = "Proceedings of the Tenth International Conference on Language Resources and Evaluation ({LREC}'16)",
month = may,
year = "2016",
address = "Portoro{\v{z}}, Slovenia",
publisher = "European Language Resources Association (ELRA)",
url = "https://aclanthology.org/L16-1214",
pages = "1346--1351",
abstract = "The fact that Japanese employs scriptio continua, or a writing system without spaces, complicates the first step of an NLP pipeline. Word segmentation is widely used in Japanese language processing, and lexical knowledge is crucial for reliable identification of words in text. Although external lexical resources like Wikipedia are potentially useful, segmentation mismatch prevents them from being straightforwardly incorporated into the word segmentation task. If we intentionally violate segmentation standards with the direct incorporation, quantitative evaluation will be no longer feasible. To address this problem, we propose to define a separate task that directly links given texts to an external resource, that is, wikification in the case of Wikipedia. By doing so, we can circumvent segmentation mismatch that may not necessarily be important for downstream applications. As the first step to realize the idea, we design the task of Japanese wikification and construct wikification corpora. We annotated subsets of the Balanced Corpus of Contemporary Written Japanese plus Twitter short messages. We also implement a simple wikifier and investigate its performance on these corpora.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="murawaki-mori-2016-wikification">
<titleInfo>
<title>Wikification for Scriptio Continua</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yugo</namePart>
<namePart type="family">Murawaki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shinsuke</namePart>
<namePart type="family">Mori</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2016-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC’16)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Khalid</namePart>
<namePart type="family">Choukri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Thierry</namePart>
<namePart type="family">Declerck</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sara</namePart>
<namePart type="family">Goggi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marko</namePart>
<namePart type="family">Grobelnik</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bente</namePart>
<namePart type="family">Maegaard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joseph</namePart>
<namePart type="family">Mariani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Helene</namePart>
<namePart type="family">Mazo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Asuncion</namePart>
<namePart type="family">Moreno</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jan</namePart>
<namePart type="family">Odijk</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stelios</namePart>
<namePart type="family">Piperidis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>European Language Resources Association (ELRA)</publisher>
<place>
<placeTerm type="text">Portorož, Slovenia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>The fact that Japanese employs scriptio continua, or a writing system without spaces, complicates the first step of an NLP pipeline. Word segmentation is widely used in Japanese language processing, and lexical knowledge is crucial for reliable identification of words in text. Although external lexical resources like Wikipedia are potentially useful, segmentation mismatch prevents them from being straightforwardly incorporated into the word segmentation task. If we intentionally violate segmentation standards with the direct incorporation, quantitative evaluation will be no longer feasible. To address this problem, we propose to define a separate task that directly links given texts to an external resource, that is, wikification in the case of Wikipedia. By doing so, we can circumvent segmentation mismatch that may not necessarily be important for downstream applications. As the first step to realize the idea, we design the task of Japanese wikification and construct wikification corpora. We annotated subsets of the Balanced Corpus of Contemporary Written Japanese plus Twitter short messages. We also implement a simple wikifier and investigate its performance on these corpora.</abstract>
<identifier type="citekey">murawaki-mori-2016-wikification</identifier>
<location>
<url>https://aclanthology.org/L16-1214</url>
</location>
<part>
<date>2016-05</date>
<extent unit="page">
<start>1346</start>
<end>1351</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Wikification for Scriptio Continua
%A Murawaki, Yugo
%A Mori, Shinsuke
%Y Calzolari, Nicoletta
%Y Choukri, Khalid
%Y Declerck, Thierry
%Y Goggi, Sara
%Y Grobelnik, Marko
%Y Maegaard, Bente
%Y Mariani, Joseph
%Y Mazo, Helene
%Y Moreno, Asuncion
%Y Odijk, Jan
%Y Piperidis, Stelios
%S Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC’16)
%D 2016
%8 May
%I European Language Resources Association (ELRA)
%C Portorož, Slovenia
%F murawaki-mori-2016-wikification
%X The fact that Japanese employs scriptio continua, or a writing system without spaces, complicates the first step of an NLP pipeline. Word segmentation is widely used in Japanese language processing, and lexical knowledge is crucial for reliable identification of words in text. Although external lexical resources like Wikipedia are potentially useful, segmentation mismatch prevents them from being straightforwardly incorporated into the word segmentation task. If we intentionally violate segmentation standards with the direct incorporation, quantitative evaluation will be no longer feasible. To address this problem, we propose to define a separate task that directly links given texts to an external resource, that is, wikification in the case of Wikipedia. By doing so, we can circumvent segmentation mismatch that may not necessarily be important for downstream applications. As the first step to realize the idea, we design the task of Japanese wikification and construct wikification corpora. We annotated subsets of the Balanced Corpus of Contemporary Written Japanese plus Twitter short messages. We also implement a simple wikifier and investigate its performance on these corpora.
%U https://aclanthology.org/L16-1214
%P 1346-1351
Markdown (Informal)
[Wikification for Scriptio Continua](https://aclanthology.org/L16-1214) (Murawaki & Mori, LREC 2016)
ACL
- Yugo Murawaki and Shinsuke Mori. 2016. Wikification for Scriptio Continua. In Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC'16), pages 1346–1351, Portorož, Slovenia. European Language Resources Association (ELRA).