% Cao and Khudanpur (2012): conference paper in the AMTA 2012 research-papers proceedings.
% The month field concatenates the predefined month macros (oct, nov) with string
% literals so a style can expand the month names; "--" typesets the day range
% with an en dash instead of a plain hyphen.
@inproceedings{cao-khudanpur-2012-sample,
    title = "Sample Selection for Large-scale {MT} Discriminative Training",
    author = "Cao, Yuan and
      Khudanpur, Sanjeev",
    booktitle = "Proceedings of the 10th Conference of the Association for Machine Translation in the Americas: Research Papers",
    month = oct # " 28--" # nov # " 1",
    year = "2012",
    address = "San Diego, California, USA",
    publisher = "Association for Machine Translation in the Americas",
    url = "https://aclanthology.org/2012.amta-papers.3",
    abstract = "Discriminative training for MT usually involves numerous features and requires large-scale training set to reach reliable parameter estimation. Other than using the expensive human-labeled parallel corpora for training, semi-supervised methods have been proposed to generate huge amount of {``}hallucinated{''} data which relieves the data sparsity problem. However the large training set contains both good samples which are suitable for training and bad ones harmful to the training. How to select training samples from vast amount of data can greatly affect the training performance. In this paper we propose a method for selecting samples that are most suitable for discriminative training according to a criterion measuring the dataset quality. Our experimental results show that by adding samples to the training set selectively, we are able to exceed the performance of system trained with the same amount of samples selected randomly.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- MODS record for the paper with citekey cao-khudanpur-2012-sample. -->
  <mods ID="cao-khudanpur-2012-sample">
    <titleInfo>
      <title>Sample Selection for Large-scale MT Discriminative Training</title>
    </titleInfo>
    <!-- Authors, one <name> element each, in listed order. -->
    <name type="personal">
      <namePart type="given">Yuan</namePart>
      <namePart type="family">Cao</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Sanjeev</namePart>
      <namePart type="family">Khudanpur</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <!-- Date is stored as unencoded free text (year, then the month/day range). -->
    <originInfo>
      <dateIssued>2012-oct 28-nov 1</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume in which the paper appears. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 10th Conference of the Association for Machine Translation in the Americas: Research Papers</title>
      </titleInfo>
      <originInfo>
        <publisher>Association for Machine Translation in the Americas</publisher>
        <place>
          <placeTerm type="text">San Diego, California, USA</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Discriminative training for MT usually involves numerous features and requires large-scale training set to reach reliable parameter estimation. Other than using the expensive human-labeled parallel corpora for training, semi-supervised methods have been proposed to generate huge amount of “hallucinated” data which relieves the data sparsity problem. However the large training set contains both good samples which are suitable for training and bad ones harmful to the training. How to select training samples from vast amount of data can greatly affect the training performance. In this paper we propose a method for selecting samples that are most suitable for discriminative training according to a criterion measuring the dataset quality. Our experimental results show that by adding samples to the training set selectively, we are able to exceed the performance of system trained with the same amount of samples selected randomly.</abstract>
    <identifier type="citekey">cao-khudanpur-2012-sample</identifier>
    <location>
      <url>https://aclanthology.org/2012.amta-papers.3</url>
    </location>
    <part>
      <date>2012-oct 28-nov 1</date>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Sample Selection for Large-scale MT Discriminative Training
%A Cao, Yuan
%A Khudanpur, Sanjeev
%S Proceedings of the 10th Conference of the Association for Machine Translation in the Americas: Research Papers
%D 2012
%8 oct 28-nov 1
%I Association for Machine Translation in the Americas
%C San Diego, California, USA
%F cao-khudanpur-2012-sample
%X Discriminative training for MT usually involves numerous features and requires large-scale training set to reach reliable parameter estimation. Other than using the expensive human-labeled parallel corpora for training, semi-supervised methods have been proposed to generate huge amount of “hallucinated” data which relieves the data sparsity problem. However the large training set contains both good samples which are suitable for training and bad ones harmful to the training. How to select training samples from vast amount of data can greatly affect the training performance. In this paper we propose a method for selecting samples that are most suitable for discriminative training according to a criterion measuring the dataset quality. Our experimental results show that by adding samples to the training set selectively, we are able to exceed the performance of system trained with the same amount of samples selected randomly.
%U https://aclanthology.org/2012.amta-papers.3
Markdown (Informal)
[Sample Selection for Large-scale MT Discriminative Training](https://aclanthology.org/2012.amta-papers.3) (Cao & Khudanpur, AMTA 2012)
ACL