@inproceedings{yujie-etal-2024-cost,
title = "Cost-efficient Crowdsourcing for Span-based Sequence Labeling:Worker Selection and Data Augmentation",
author = "Wang, Yujie and
Huang, Chao and
Yang, Liner and
Fang, Zhixuan and
Huang, Yaping and
Liu, Yang and
Yu, Jingsi and
Yang, Erhong",
editor = "Maosong, Sun and
Jiye, Liang and
Xianpei, Han and
Zhiyuan, Liu and
Yulan, He",
booktitle = "Proceedings of the 23rd Chinese National Conference on Computational Linguistics (Volume 1: Main Conference)",
month = jul,
year = "2024",
address = "Taiyuan, China",
publisher = "Chinese Information Processing Society of China",
url = "https://aclanthology.org/2024.ccl-1.96/",
pages = "1239--1256",
language = "eng",
abstract = "``This paper introduces a novel crowdsourcing worker selection algorithm, enhancing annotationquality and reducing costs. Unlike previous studies targeting simpler tasks, this study con-tends with the complexities of label interdependencies in sequence labeling. The proposedalgorithm utilizes a Combinatorial Multi-Armed Bandit (CMAB) approach for worker selec-tion, and a cost-effective human feedback mechanism. The challenge of dealing with imbal-anced and small-scale datasets, which hinders offline simulation of worker selection, is tack-led using an innovative data augmentation method termed shifting, expanding, and shrink-ing (SES). Rigorous testing on CoNLL 2003 NER and Chinese OEI datasets showcased thealgorithm{'}s efficiency, with an increase in F1 score up to 100.04{\%} of the expert-only base-line, alongside cost savings up to 65.97{\%}. The paper also encompasses a dataset-independenttest emulating annotation evaluation through a Bernoulli distribution, which still led to animpressive 97.56{\%} F1 score of the expert baseline and 59.88{\%} cost savings. Furthermore,our approach can be seamlessly integrated into Reinforcement Learning from Human Feed-back (RLHF) systems, offering a cost-effective solution for obtaining human feedback. All re-sources, including source code and datasets, are available to the broader research community athttps://github.com/blcuicall/nlp-crowdsourcing.''"
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="yujie-etal-2024-cost">
<titleInfo>
<title>Cost-efficient Crowdsourcing for Span-based Sequence Labeling:Worker Selection and Data Augmentation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yujie</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chao</namePart>
<namePart type="family">Huang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Liner</namePart>
<namePart type="family">Yang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhixuan</namePart>
<namePart type="family">Fang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yaping</namePart>
<namePart type="family">Huang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yang</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jingsi</namePart>
<namePart type="family">Yu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Erhong</namePart>
<namePart type="family">Yang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<language>
<languageTerm type="text">eng</languageTerm>
</language>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 23rd Chinese National Conference on Computational Linguistics (Volume 1: Main Conference)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Sun</namePart>
<namePart type="family">Maosong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Liang</namePart>
<namePart type="family">Jiye</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Han</namePart>
<namePart type="family">Xianpei</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Liu</namePart>
<namePart type="family">Zhiyuan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">He</namePart>
<namePart type="family">Yulan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Chinese Information Processing Society of China</publisher>
<place>
<placeTerm type="text">Taiyuan, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>“This paper introduces a novel crowdsourcing worker selection algorithm, enhancing annotationquality and reducing costs. Unlike previous studies targeting simpler tasks, this study con-tends with the complexities of label interdependencies in sequence labeling. The proposedalgorithm utilizes a Combinatorial Multi-Armed Bandit (CMAB) approach for worker selec-tion, and a cost-effective human feedback mechanism. The challenge of dealing with imbal-anced and small-scale datasets, which hinders offline simulation of worker selection, is tack-led using an innovative data augmentation method termed shifting, expanding, and shrink-ing (SES). Rigorous testing on CoNLL 2003 NER and Chinese OEI datasets showcased thealgorithm’s efficiency, with an increase in F1 score up to 100.04% of the expert-only base-line, alongside cost savings up to 65.97%. The paper also encompasses a dataset-independenttest emulating annotation evaluation through a Bernoulli distribution, which still led to animpressive 97.56% F1 score of the expert baseline and 59.88% cost savings. Furthermore,our approach can be seamlessly integrated into Reinforcement Learning from Human Feed-back (RLHF) systems, offering a cost-effective solution for obtaining human feedback. All re-sources, including source code and datasets, are available to the broader research community athttps://github.com/blcuicall/nlp-crowdsourcing.”</abstract>
<identifier type="citekey">yujie-etal-2024-cost</identifier>
<location>
<url>https://aclanthology.org/2024.ccl-1.96/</url>
</location>
<part>
<date>2024-07</date>
<extent unit="page">
<start>1239</start>
<end>1256</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Cost-efficient Crowdsourcing for Span-based Sequence Labeling:Worker Selection and Data Augmentation
%A Wang, Yujie
%A Huang, Chao
%A Yang, Liner
%A Fang, Zhixuan
%A Huang, Yaping
%A Liu, Yang
%A Yu, Jingsi
%A Yang, Erhong
%Y Maosong, Sun
%Y Jiye, Liang
%Y Xianpei, Han
%Y Zhiyuan, Liu
%Y Yulan, He
%S Proceedings of the 23rd Chinese National Conference on Computational Linguistics (Volume 1: Main Conference)
%D 2024
%8 July
%I Chinese Information Processing Society of China
%C Taiyuan, China
%G eng
%F yujie-etal-2024-cost
%X “This paper introduces a novel crowdsourcing worker selection algorithm, enhancing annotationquality and reducing costs. Unlike previous studies targeting simpler tasks, this study con-tends with the complexities of label interdependencies in sequence labeling. The proposedalgorithm utilizes a Combinatorial Multi-Armed Bandit (CMAB) approach for worker selec-tion, and a cost-effective human feedback mechanism. The challenge of dealing with imbal-anced and small-scale datasets, which hinders offline simulation of worker selection, is tack-led using an innovative data augmentation method termed shifting, expanding, and shrink-ing (SES). Rigorous testing on CoNLL 2003 NER and Chinese OEI datasets showcased thealgorithm’s efficiency, with an increase in F1 score up to 100.04% of the expert-only base-line, alongside cost savings up to 65.97%. The paper also encompasses a dataset-independenttest emulating annotation evaluation through a Bernoulli distribution, which still led to animpressive 97.56% F1 score of the expert baseline and 59.88% cost savings. Furthermore,our approach can be seamlessly integrated into Reinforcement Learning from Human Feed-back (RLHF) systems, offering a cost-effective solution for obtaining human feedback. All re-sources, including source code and datasets, are available to the broader research community athttps://github.com/blcuicall/nlp-crowdsourcing.”
%U https://aclanthology.org/2024.ccl-1.96/
%P 1239-1256
Markdown (Informal)
[Cost-efficient Crowdsourcing for Span-based Sequence Labeling:Worker Selection and Data Augmentation](https://aclanthology.org/2024.ccl-1.96/) (Wang et al., CCL 2024)
ACL
- Yujie Wang, Chao Huang, Liner Yang, Zhixuan Fang, Yaping Huang, Yang Liu, Jingsi Yu, and Erhong Yang. 2024. Cost-efficient Crowdsourcing for Span-based Sequence Labeling:Worker Selection and Data Augmentation. In Proceedings of the 23rd Chinese National Conference on Computational Linguistics (Volume 1: Main Conference), pages 1239–1256, Taiyuan, China. Chinese Information Processing Society of China.