% Workshop paper by Li and Fukumoto (2019); ACL Anthology ID D19-5904.
% The acronym NLP in booktitle is brace-protected so that styles which
% sentence-case the proceedings title keep it capitalised.
% NOTE(review): address is kept exactly as exported (Hong Kong); presumably it
% is the conference location rather than the publisher's city - confirm before
% normalising.
@inproceedings{li-fukumoto-2019-dataset,
  author    = {Li, Jiyi and Fukumoto, Fumiyo},
  title     = {A Dataset of Crowdsourced Word Sequences: Collections and Answer Aggregation for Ground Truth Creation},
  editor    = {Paun, Silviu and Hovy, Dirk},
  booktitle = {Proceedings of the First Workshop on Aggregating and Analysing Crowdsourced Annotations for {NLP}},
  month     = nov,
  year      = {2019},
  address   = {Hong Kong},
  publisher = {Association for Computational Linguistics},
  pages     = {24--28},
  doi       = {10.18653/v1/D19-5904},
  url       = {https://aclanthology.org/D19-5904},
  abstract  = {The target outputs of many NLP tasks are word sequences. To collect the data for training and evaluating models, the crowd is a cheaper and easier to access than the oracle. To ensure the quality of the crowdsourced data, people can assign multiple workers to one question and then aggregate the multiple answers with diverse quality into a golden one. How to aggregate multiple crowdsourced word sequences with diverse quality is a curious and challenging problem. People need a dataset for addressing this problem. We thus create a dataset (CrowdWSA2019) which contains the translated sentences generated from multiple workers. We provide three approaches as the baselines on the task of extractive word sequence aggregation. Specially, one of them is an original one we propose which models the reliability of workers. We also discuss some issues on ground truth creation of word sequences which can be addressed based on this dataset.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS collection holding a single record, citekey li-fukumoto-2019-dataset. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="li-fukumoto-2019-dataset">
    <!-- Title of the paper itself. -->
    <titleInfo>
      <title>A Dataset of Crowdsourced Word Sequences: Collections and Answer Aggregation for Ground Truth Creation</title>
    </titleInfo>
    <!-- Authors. -->
    <name type="personal">
      <namePart type="given">Jiyi</namePart>
      <namePart type="family">Li</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Fumiyo</namePart>
      <namePart type="family">Fukumoto</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2019-11</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, with its editors, publisher and place. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the First Workshop on Aggregating and Analysing Crowdsourced Annotations for NLP</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Silviu</namePart>
        <namePart type="family">Paun</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Dirk</namePart>
        <namePart type="family">Hovy</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Hong Kong</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>The target outputs of many NLP tasks are word sequences. To collect the data for training and evaluating models, the crowd is a cheaper and easier to access than the oracle. To ensure the quality of the crowdsourced data, people can assign multiple workers to one question and then aggregate the multiple answers with diverse quality into a golden one. How to aggregate multiple crowdsourced word sequences with diverse quality is a curious and challenging problem. People need a dataset for addressing this problem. We thus create a dataset (CrowdWSA2019) which contains the translated sentences generated from multiple workers. We provide three approaches as the baselines on the task of extractive word sequence aggregation. Specially, one of them is an original one we propose which models the reliability of workers. We also discuss some issues on ground truth creation of word sequences which can be addressed based on this dataset.</abstract>
    <!-- Identifiers and access location. -->
    <identifier type="citekey">li-fukumoto-2019-dataset</identifier>
    <identifier type="doi">10.18653/v1/D19-5904</identifier>
    <location>
      <url>https://aclanthology.org/D19-5904</url>
    </location>
    <!-- Date and page extent of this part. -->
    <part>
      <date>2019-11</date>
      <extent unit="page">
        <start>24</start>
        <end>28</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T A Dataset of Crowdsourced Word Sequences: Collections and Answer Aggregation for Ground Truth Creation
%A Li, Jiyi
%A Fukumoto, Fumiyo
%Y Paun, Silviu
%Y Hovy, Dirk
%S Proceedings of the First Workshop on Aggregating and Analysing Crowdsourced Annotations for NLP
%D 2019
%8 November
%I Association for Computational Linguistics
%C Hong Kong
%F li-fukumoto-2019-dataset
%X The target outputs of many NLP tasks are word sequences. To collect the data for training and evaluating models, the crowd is a cheaper and easier to access than the oracle. To ensure the quality of the crowdsourced data, people can assign multiple workers to one question and then aggregate the multiple answers with diverse quality into a golden one. How to aggregate multiple crowdsourced word sequences with diverse quality is a curious and challenging problem. People need a dataset for addressing this problem. We thus create a dataset (CrowdWSA2019) which contains the translated sentences generated from multiple workers. We provide three approaches as the baselines on the task of extractive word sequence aggregation. Specially, one of them is an original one we propose which models the reliability of workers. We also discuss some issues on ground truth creation of word sequences which can be addressed based on this dataset.
%R 10.18653/v1/D19-5904
%U https://aclanthology.org/D19-5904
%U https://doi.org/10.18653/v1/D19-5904
%P 24-28
Markdown (Informal)
[A Dataset of Crowdsourced Word Sequences: Collections and Answer Aggregation for Ground Truth Creation](https://aclanthology.org/D19-5904) (Li & Fukumoto, 2019)
ACL