@inproceedings{kumar-etal-2022-controlled,
title = "Controlled Data Generation via Insertion Operations for {NLU}",
author = "Kumar, Manoj and
Merhav, Yuval and
Khan, Haidar and
Gupta, Rahul and
Rumshisky, Anna and
Hamza, Wael",
editor = "Loukina, Anastassia and
Gangadharaiah, Rashmi and
Min, Bonan",
booktitle = "Proceedings of the 2022 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies: Industry Track",
month = jul,
year = "2022",
address = "Hybrid: Seattle, Washington + Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.naacl-industry.7",
doi = "10.18653/v1/2022.naacl-industry.7",
pages = "54--61",
abstract = "Use of synthetic data is rapidly emerging as a realistic alternative to manually annotating live traffic for industry-scale model building. Manual data annotation is slow, expensive and not preferred for meeting customer privacy expectations. Further, commercial natural language applications are required to support continuously evolving features as well as newly added experiences. To address these requirements, we propose a targeted synthetic data generation technique by inserting tokens into a given semantic signature. The generated data are used as additional training samples in the tasks of intent classification and named entity recognition. We evaluate on a real-world voice assistant dataset, and using only 33{\%} of the available training set, we achieve the same accuracy as training with all available data. Further, we analyze the effects of data generation across varied real-world applications and propose heuristics that improve the task performance further.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="kumar-etal-2022-controlled">
<titleInfo>
<title>Controlled Data Generation via Insertion Operations for NLU</title>
</titleInfo>
<name type="personal">
<namePart type="given">Manoj</namePart>
<namePart type="family">Kumar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yuval</namePart>
<namePart type="family">Merhav</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Haidar</namePart>
<namePart type="family">Khan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rahul</namePart>
<namePart type="family">Gupta</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anna</namePart>
<namePart type="family">Rumshisky</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wael</namePart>
<namePart type="family">Hamza</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2022 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies: Industry Track</title>
</titleInfo>
<name type="personal">
<namePart type="given">Anastassia</namePart>
<namePart type="family">Loukina</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rashmi</namePart>
<namePart type="family">Gangadharaiah</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bonan</namePart>
<namePart type="family">Min</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Hybrid: Seattle, Washington + Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Use of synthetic data is rapidly emerging as a realistic alternative to manually annotating live traffic for industry-scale model building. Manual data annotation is slow, expensive and not preferred for meeting customer privacy expectations. Further, commercial natural language applications are required to support continuously evolving features as well as newly added experiences. To address these requirements, we propose a targeted synthetic data generation technique by inserting tokens into a given semantic signature. The generated data are used as additional training samples in the tasks of intent classification and named entity recognition. We evaluate on a real-world voice assistant dataset, and using only 33% of the available training set, we achieve the same accuracy as training with all available data. Further, we analyze the effects of data generation across varied real-world applications and propose heuristics that improve the task performance further.</abstract>
<identifier type="citekey">kumar-etal-2022-controlled</identifier>
<identifier type="doi">10.18653/v1/2022.naacl-industry.7</identifier>
<location>
<url>https://aclanthology.org/2022.naacl-industry.7</url>
</location>
<part>
<date>2022-07</date>
<extent unit="page">
<start>54</start>
<end>61</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Controlled Data Generation via Insertion Operations for NLU
%A Kumar, Manoj
%A Merhav, Yuval
%A Khan, Haidar
%A Gupta, Rahul
%A Rumshisky, Anna
%A Hamza, Wael
%Y Loukina, Anastassia
%Y Gangadharaiah, Rashmi
%Y Min, Bonan
%S Proceedings of the 2022 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies: Industry Track
%D 2022
%8 July
%I Association for Computational Linguistics
%C Hybrid: Seattle, Washington + Online
%F kumar-etal-2022-controlled
%X Use of synthetic data is rapidly emerging as a realistic alternative to manually annotating live traffic for industry-scale model building. Manual data annotation is slow, expensive and not preferred for meeting customer privacy expectations. Further, commercial natural language applications are required to support continuously evolving features as well as newly added experiences. To address these requirements, we propose a targeted synthetic data generation technique by inserting tokens into a given semantic signature. The generated data are used as additional training samples in the tasks of intent classification and named entity recognition. We evaluate on a real-world voice assistant dataset, and using only 33% of the available training set, we achieve the same accuracy as training with all available data. Further, we analyze the effects of data generation across varied real-world applications and propose heuristics that improve the task performance further.
%R 10.18653/v1/2022.naacl-industry.7
%U https://aclanthology.org/2022.naacl-industry.7
%U https://doi.org/10.18653/v1/2022.naacl-industry.7
%P 54-61
Markdown (Informal)
[Controlled Data Generation via Insertion Operations for NLU](https://aclanthology.org/2022.naacl-industry.7) (Kumar et al., NAACL 2022)
ACL
- Manoj Kumar, Yuval Merhav, Haidar Khan, Rahul Gupta, Anna Rumshisky, and Wael Hamza. 2022. Controlled Data Generation via Insertion Operations for NLU. In Proceedings of the 2022 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies: Industry Track, pages 54–61, Hybrid: Seattle, Washington + Online. Association for Computational Linguistics.