@inproceedings{wang-etal-2010-adapting,
title = "Adapting {C}hinese Word Segmentation for Machine Translation Based on Short Units",
author = "Wang, Yiou and
Uchimoto, Kiyotaka and
Kazama, Jun{'}ichi and
Kruengkrai, Canasai and
Torisawa, Kentaro",
editor = "Calzolari, Nicoletta and
Choukri, Khalid and
Maegaard, Bente and
Mariani, Joseph and
Odijk, Jan and
Piperidis, Stelios and
Rosner, Mike and
Tapias, Daniel",
booktitle = "Proceedings of the Seventh International Conference on Language Resources and Evaluation ({LREC}'10)",
month = may,
year = "2010",
address = "Valletta, Malta",
publisher = "European Language Resources Association (ELRA)",
url = "http://www.lrec-conf.org/proceedings/lrec2010/pdf/83_Paper.pdf",
abstract = "In Chinese texts, words composed of single or multiple characters are not separated by spaces, unlike most western languages. Therefore Chinese word segmentation is considered an important first step in machine translation (MT) and its performance impacts MT results. Many factors affect Chinese word segmentations, including the segmentation standards and segmentation strategies. The performance of a corpus-based word segmentation model depends heavily on the quality and the segmentation standard of the training corpora. However, we observed that existing manually annotated Chinese corpora tend to have low segmentation granularity and provide poor morphological information due to the present segmentation standards. In this paper, we introduce a short-unit standard of Chinese word segmentation, which is particularly suitable for machine translation, and propose a semi-automatic method of transforming the existing corpora into the ones that can satisfy our standards. We evaluate the usefulness of our approach on the basis of translation tasks from the technology newswire domain and the scientific paper domain, and demonstrate that it significantly improves the performance of Chinese-Japanese machine translation (over 1.0 BLEU increase).",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="wang-etal-2010-adapting">
<titleInfo>
<title>Adapting Chinese Word Segmentation for Machine Translation Based on Short Units</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yiou</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kiyotaka</namePart>
<namePart type="family">Uchimoto</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jun’ichi</namePart>
<namePart type="family">Kazama</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Canasai</namePart>
<namePart type="family">Kruengkrai</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kentaro</namePart>
<namePart type="family">Torisawa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2010-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Seventh International Conference on Language Resources and Evaluation (LREC’10)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Khalid</namePart>
<namePart type="family">Choukri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bente</namePart>
<namePart type="family">Maegaard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joseph</namePart>
<namePart type="family">Mariani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jan</namePart>
<namePart type="family">Odijk</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stelios</namePart>
<namePart type="family">Piperidis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mike</namePart>
<namePart type="family">Rosner</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Daniel</namePart>
<namePart type="family">Tapias</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>European Language Resources Association (ELRA)</publisher>
<place>
<placeTerm type="text">Valletta, Malta</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>In Chinese texts, words composed of single or multiple characters are not separated by spaces, unlike most western languages. Therefore Chinese word segmentation is considered an important first step in machine translation (MT) and its performance impacts MT results. Many factors affect Chinese word segmentations, including the segmentation standards and segmentation strategies. The performance of a corpus-based word segmentation model depends heavily on the quality and the segmentation standard of the training corpora. However, we observed that existing manually annotated Chinese corpora tend to have low segmentation granularity and provide poor morphological information due to the present segmentation standards. In this paper, we introduce a short-unit standard of Chinese word segmentation, which is particularly suitable for machine translation, and propose a semi-automatic method of transforming the existing corpora into the ones that can satisfy our standards. We evaluate the usefulness of our approach on the basis of translation tasks from the technology newswire domain and the scientific paper domain, and demonstrate that it significantly improves the performance of Chinese-Japanese machine translation (over 1.0 BLEU increase).</abstract>
<identifier type="citekey">wang-etal-2010-adapting</identifier>
<location>
<url>http://www.lrec-conf.org/proceedings/lrec2010/pdf/83_Paper.pdf</url>
</location>
<part>
<date>2010-05</date>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Adapting Chinese Word Segmentation for Machine Translation Based on Short Units
%A Wang, Yiou
%A Uchimoto, Kiyotaka
%A Kazama, Jun’ichi
%A Kruengkrai, Canasai
%A Torisawa, Kentaro
%Y Calzolari, Nicoletta
%Y Choukri, Khalid
%Y Maegaard, Bente
%Y Mariani, Joseph
%Y Odijk, Jan
%Y Piperidis, Stelios
%Y Rosner, Mike
%Y Tapias, Daniel
%S Proceedings of the Seventh International Conference on Language Resources and Evaluation (LREC’10)
%D 2010
%8 May
%I European Language Resources Association (ELRA)
%C Valletta, Malta
%F wang-etal-2010-adapting
%X In Chinese texts, words composed of single or multiple characters are not separated by spaces, unlike most western languages. Therefore Chinese word segmentation is considered an important first step in machine translation (MT) and its performance impacts MT results. Many factors affect Chinese word segmentations, including the segmentation standards and segmentation strategies. The performance of a corpus-based word segmentation model depends heavily on the quality and the segmentation standard of the training corpora. However, we observed that existing manually annotated Chinese corpora tend to have low segmentation granularity and provide poor morphological information due to the present segmentation standards. In this paper, we introduce a short-unit standard of Chinese word segmentation, which is particularly suitable for machine translation, and propose a semi-automatic method of transforming the existing corpora into the ones that can satisfy our standards. We evaluate the usefulness of our approach on the basis of translation tasks from the technology newswire domain and the scientific paper domain, and demonstrate that it significantly improves the performance of Chinese-Japanese machine translation (over 1.0 BLEU increase).
%U http://www.lrec-conf.org/proceedings/lrec2010/pdf/83_Paper.pdf
Markdown (Informal)
[Adapting Chinese Word Segmentation for Machine Translation Based on Short Units](http://www.lrec-conf.org/proceedings/lrec2010/pdf/83_Paper.pdf) (Wang et al., LREC 2010)
ACL