@inproceedings{neubig-mori-2010-word,
    title = "Word-based Partial Annotation for Efficient Corpus Construction",
    author = "Neubig, Graham  and
      Mori, Shinsuke",
    editor = "Calzolari, Nicoletta  and
      Choukri, Khalid  and
      Maegaard, Bente  and
      Mariani, Joseph  and
      Odijk, Jan  and
      Piperidis, Stelios  and
      Rosner, Mike  and
      Tapias, Daniel",
    booktitle = "Proceedings of the Seventh International Conference on Language Resources and Evaluation ({LREC}'10)",
    month = may,
    year = "2010",
    address = "Valletta, Malta",
    publisher = "European Language Resources Association (ELRA)",
    url = "https://aclanthology.org/L10-1281/",
    abstract = "In order to utilize the corpus-based techniques that have proven effective in natural language processing in recent years, costly and time-consuming manual creation of linguistic resources is often necessary. Traditionally these resources are created on the document or sentence-level. In this paper, we examine the benefit of annotating only particular words with high information content, as opposed to the entire sentence or document. Using the task of Japanese pronunciation estimation as an example, we devise a machine learning method that can be trained on data annotated word-by-word. This is done by dividing the estimation process into two steps (word segmentation and word-based pronunciation estimation), and introducing a point-wise estimator that is able to make each decision independent of the other decisions made for a particular sentence. In an evaluation, the proposed strategy is shown to provide greater increases in accuracy using a smaller number of annotated words than traditional sentence-based annotation techniques."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="neubig-mori-2010-word">
    <titleInfo>
        <title>Word-based Partial Annotation for Efficient Corpus Construction</title>
    </titleInfo>
    <name type="personal">
        <namePart type="given">Graham</namePart>
        <namePart type="family">Neubig</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Shinsuke</namePart>
        <namePart type="family">Mori</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <originInfo>
        <dateIssued>2010-05</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
        <titleInfo>
            <title>Proceedings of the Seventh International Conference on Language Resources and Evaluation (LREC’10)</title>
        </titleInfo>
        <name type="personal">
            <namePart type="given">Nicoletta</namePart>
            <namePart type="family">Calzolari</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <name type="personal">
            <namePart type="given">Khalid</namePart>
            <namePart type="family">Choukri</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <name type="personal">
            <namePart type="given">Bente</namePart>
            <namePart type="family">Maegaard</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <name type="personal">
            <namePart type="given">Joseph</namePart>
            <namePart type="family">Mariani</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <name type="personal">
            <namePart type="given">Jan</namePart>
            <namePart type="family">Odijk</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <name type="personal">
            <namePart type="given">Stelios</namePart>
            <namePart type="family">Piperidis</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <name type="personal">
            <namePart type="given">Mike</namePart>
            <namePart type="family">Rosner</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <name type="personal">
            <namePart type="given">Daniel</namePart>
            <namePart type="family">Tapias</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <originInfo>
            <publisher>European Language Resources Association (ELRA)</publisher>
            <place>
                <placeTerm type="text">Valletta, Malta</placeTerm>
            </place>
        </originInfo>
        <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>In order to utilize the corpus-based techniques that have proven effective in natural language processing in recent years, costly and time-consuming manual creation of linguistic resources is often necessary. Traditionally these resources are created on the document or sentence-level. In this paper, we examine the benefit of annotating only particular words with high information content, as opposed to the entire sentence or document. Using the task of Japanese pronunciation estimation as an example, we devise a machine learning method that can be trained on data annotated word-by-word. This is done by dividing the estimation process into two steps (word segmentation and word-based pronunciation estimation), and introducing a point-wise estimator that is able to make each decision independent of the other decisions made for a particular sentence. In an evaluation, the proposed strategy is shown to provide greater increases in accuracy using a smaller number of annotated words than traditional sentence-based annotation techniques.</abstract>
    <identifier type="citekey">neubig-mori-2010-word</identifier>
    <location>
        <url>https://aclanthology.org/L10-1281/</url>
    </location>
    <part>
        <date>2010-05</date>
    </part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Word-based Partial Annotation for Efficient Corpus Construction
%A Neubig, Graham
%A Mori, Shinsuke
%Y Calzolari, Nicoletta
%Y Choukri, Khalid
%Y Maegaard, Bente
%Y Mariani, Joseph
%Y Odijk, Jan
%Y Piperidis, Stelios
%Y Rosner, Mike
%Y Tapias, Daniel
%S Proceedings of the Seventh International Conference on Language Resources and Evaluation (LREC’10)
%D 2010
%8 May
%I European Language Resources Association (ELRA)
%C Valletta, Malta
%F neubig-mori-2010-word
%X In order to utilize the corpus-based techniques that have proven effective in natural language processing in recent years, costly and time-consuming manual creation of linguistic resources is often necessary. Traditionally these resources are created on the document or sentence-level. In this paper, we examine the benefit of annotating only particular words with high information content, as opposed to the entire sentence or document. Using the task of Japanese pronunciation estimation as an example, we devise a machine learning method that can be trained on data annotated word-by-word. This is done by dividing the estimation process into two steps (word segmentation and word-based pronunciation estimation), and introducing a point-wise estimator that is able to make each decision independent of the other decisions made for a particular sentence. In an evaluation, the proposed strategy is shown to provide greater increases in accuracy using a smaller number of annotated words than traditional sentence-based annotation techniques.
%U https://aclanthology.org/L10-1281/
Markdown (Informal)
[Word-based Partial Annotation for Efficient Corpus Construction](https://aclanthology.org/L10-1281/) (Neubig & Mori, LREC 2010)
ACL