@inproceedings{hsieh-2014-chinese,
title = "Why {C}hinese Web-as-Corpus is Wacky? Or: How Big Data is Killing {C}hinese Corpus Linguistics",
author = "Hsieh, Shu-Kai",
editor = "Calzolari, Nicoletta and
Choukri, Khalid and
Declerck, Thierry and
Loftsson, Hrafn and
Maegaard, Bente and
Mariani, Joseph and
Moreno, Asuncion and
Odijk, Jan and
Piperidis, Stelios",
booktitle = "Proceedings of the Ninth International Conference on Language Resources and Evaluation ({LREC}'14)",
month = may,
year = "2014",
address = "Reykjavik, Iceland",
publisher = "European Language Resources Association (ELRA)",
url = "http://www.lrec-conf.org/proceedings/lrec2014/pdf/843_Paper.pdf",
pages = "2386--2389",
abstract = "This paper aims to examine and evaluate the current development of using Web-as-Corpus (WaC) paradigm in Chinese corpus linguistics. I will argue that the unstable notion of wordhood in Chinese and the resulting diverse ideas of implementing word segmentation systems have posed great challenges for those who are keen on building web-scaled corpus data. Two lexical measures are proposed to illustrate the issues and methodological discussions are provided.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="hsieh-2014-chinese">
<titleInfo>
<title>Why Chinese Web-as-Corpus is Wacky? Or: How Big Data is Killing Chinese Corpus Linguistics</title>
</titleInfo>
<name type="personal">
<namePart type="given">Shu-Kai</namePart>
<namePart type="family">Hsieh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2014-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Ninth International Conference on Language Resources and Evaluation (LREC’14)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Khalid</namePart>
<namePart type="family">Choukri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Thierry</namePart>
<namePart type="family">Declerck</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hrafn</namePart>
<namePart type="family">Loftsson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bente</namePart>
<namePart type="family">Maegaard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joseph</namePart>
<namePart type="family">Mariani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Asuncion</namePart>
<namePart type="family">Moreno</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jan</namePart>
<namePart type="family">Odijk</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stelios</namePart>
<namePart type="family">Piperidis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>European Language Resources Association (ELRA)</publisher>
<place>
<placeTerm type="text">Reykjavik, Iceland</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>This paper aims to examine and evaluate the current development of using Web-as-Corpus (WaC) paradigm in Chinese corpus linguistics. I will argue that the unstable notion of wordhood in Chinese and the resulting diverse ideas of implementing word segmentation systems have posed great challenges for those who are keen on building web-scaled corpus data. Two lexical measures are proposed to illustrate the issues and methodological discussions are provided.</abstract>
<identifier type="citekey">hsieh-2014-chinese</identifier>
<location>
<url>http://www.lrec-conf.org/proceedings/lrec2014/pdf/843_Paper.pdf</url>
</location>
<part>
<date>2014-05</date>
<extent unit="page">
<start>2386</start>
<end>2389</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Why Chinese Web-as-Corpus is Wacky? Or: How Big Data is Killing Chinese Corpus Linguistics
%A Hsieh, Shu-Kai
%Y Calzolari, Nicoletta
%Y Choukri, Khalid
%Y Declerck, Thierry
%Y Loftsson, Hrafn
%Y Maegaard, Bente
%Y Mariani, Joseph
%Y Moreno, Asuncion
%Y Odijk, Jan
%Y Piperidis, Stelios
%S Proceedings of the Ninth International Conference on Language Resources and Evaluation (LREC’14)
%D 2014
%8 May
%I European Language Resources Association (ELRA)
%C Reykjavik, Iceland
%F hsieh-2014-chinese
%X This paper aims to examine and evaluate the current development of using Web-as-Corpus (WaC) paradigm in Chinese corpus linguistics. I will argue that the unstable notion of wordhood in Chinese and the resulting diverse ideas of implementing word segmentation systems have posed great challenges for those who are keen on building web-scaled corpus data. Two lexical measures are proposed to illustrate the issues and methodological discussions are provided.
%U http://www.lrec-conf.org/proceedings/lrec2014/pdf/843_Paper.pdf
%P 2386-2389
Markdown (Informal)
[Why Chinese Web-as-Corpus is Wacky? Or: How Big Data is Killing Chinese Corpus Linguistics](http://www.lrec-conf.org/proceedings/lrec2014/pdf/843_Paper.pdf) (Hsieh, LREC 2014)
ACL