@inproceedings{sato-2014-text,
title = "Text Readability and Word Distribution in {J}apanese",
author = "Sato, Satoshi",
editor = "Calzolari, Nicoletta and
Choukri, Khalid and
Declerck, Thierry and
Loftsson, Hrafn and
Maegaard, Bente and
Mariani, Joseph and
Moreno, Asuncion and
Odijk, Jan and
Piperidis, Stelios",
booktitle = "Proceedings of the Ninth International Conference on Language Resources and Evaluation ({LREC}'14)",
month = may,
year = "2014",
address = "Reykjavik, Iceland",
publisher = "European Language Resources Association (ELRA)",
url = "http://www.lrec-conf.org/proceedings/lrec2014/pdf/633_Paper.pdf",
pages = "2811--2815",
abstract = "This paper reports the relation between text readability and word distribution in the Japanese language. There was no similar study in the past due to three major obstacles: (1) unclear definition of Japanese {``}word{''}, (2) no balanced corpus, and (3) no readability measure. Compilation of the Balanced Corpus of Contemporary Written Japanese (BCCWJ) and development of a readability predictor remove these three obstacles and enable this study. First, we have counted the frequency of each word in each text in the corpus. Then we have calculated the frequency rank of words both in the whole corpus and in each of three readability bands. Three major findings are: (1) the proportion of high-frequent words to tokens in Japanese is lower than that in English; (2) the type-coverage curve of words in the difficult-band draws an unexpected shape; (3) the size of the intersection between high-frequent words in the easy-band and these in the difficult-band is unexpectedly small.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="sato-2014-text">
<titleInfo>
<title>Text Readability and Word Distribution in Japanese</title>
</titleInfo>
<name type="personal">
<namePart type="given">Satoshi</namePart>
<namePart type="family">Sato</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2014-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Ninth International Conference on Language Resources and Evaluation (LREC’14)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Khalid</namePart>
<namePart type="family">Choukri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Thierry</namePart>
<namePart type="family">Declerck</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hrafn</namePart>
<namePart type="family">Loftsson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bente</namePart>
<namePart type="family">Maegaard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joseph</namePart>
<namePart type="family">Mariani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Asuncion</namePart>
<namePart type="family">Moreno</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jan</namePart>
<namePart type="family">Odijk</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stelios</namePart>
<namePart type="family">Piperidis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>European Language Resources Association (ELRA)</publisher>
<place>
<placeTerm type="text">Reykjavik, Iceland</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>This paper reports the relation between text readability and word distribution in the Japanese language. There was no similar study in the past due to three major obstacles: (1) unclear definition of Japanese “word”, (2) no balanced corpus, and (3) no readability measure. Compilation of the Balanced Corpus of Contemporary Written Japanese (BCCWJ) and development of a readability predictor remove these three obstacles and enable this study. First, we have counted the frequency of each word in each text in the corpus. Then we have calculated the frequency rank of words both in the whole corpus and in each of three readability bands. Three major findings are: (1) the proportion of high-frequent words to tokens in Japanese is lower than that in English; (2) the type-coverage curve of words in the difficult-band draws an unexpected shape; (3) the size of the intersection between high-frequent words in the easy-band and these in the difficult-band is unexpectedly small.</abstract>
<identifier type="citekey">sato-2014-text</identifier>
<location>
<url>http://www.lrec-conf.org/proceedings/lrec2014/pdf/633_Paper.pdf</url>
</location>
<part>
<date>2014-05</date>
<extent unit="page">
<start>2811</start>
<end>2815</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Text Readability and Word Distribution in Japanese
%A Sato, Satoshi
%Y Calzolari, Nicoletta
%Y Choukri, Khalid
%Y Declerck, Thierry
%Y Loftsson, Hrafn
%Y Maegaard, Bente
%Y Mariani, Joseph
%Y Moreno, Asuncion
%Y Odijk, Jan
%Y Piperidis, Stelios
%S Proceedings of the Ninth International Conference on Language Resources and Evaluation (LREC’14)
%D 2014
%8 May
%I European Language Resources Association (ELRA)
%C Reykjavik, Iceland
%F sato-2014-text
%X This paper reports the relation between text readability and word distribution in the Japanese language. There was no similar study in the past due to three major obstacles: (1) unclear definition of Japanese “word”, (2) no balanced corpus, and (3) no readability measure. Compilation of the Balanced Corpus of Contemporary Written Japanese (BCCWJ) and development of a readability predictor remove these three obstacles and enable this study. First, we have counted the frequency of each word in each text in the corpus. Then we have calculated the frequency rank of words both in the whole corpus and in each of three readability bands. Three major findings are: (1) the proportion of high-frequent words to tokens in Japanese is lower than that in English; (2) the type-coverage curve of words in the difficult-band draws an unexpected shape; (3) the size of the intersection between high-frequent words in the easy-band and these in the difficult-band is unexpectedly small.
%U http://www.lrec-conf.org/proceedings/lrec2014/pdf/633_Paper.pdf
%P 2811-2815
Markdown (Informal)
[Text Readability and Word Distribution in Japanese](http://www.lrec-conf.org/proceedings/lrec2014/pdf/633_Paper.pdf) (Sato, LREC 2014)
ACL
- Satoshi Sato. 2014. Text Readability and Word Distribution in Japanese. In Proceedings of the Ninth International Conference on Language Resources and Evaluation (LREC'14), pages 2811–2815, Reykjavik, Iceland. European Language Resources Association (ELRA).