@inproceedings{kim-etal-2022-generating,
title = "Generating Information-Seeking Conversations from Unlabeled Documents",
author = "Kim, Gangwoo and
Kim, Sungdong and
Yoo, Kang Min and
Kang, Jaewoo",
editor = "Goldberg, Yoav and
Kozareva, Zornitsa and
Zhang, Yue",
booktitle = "Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing",
month = dec,
year = "2022",
address = "Abu Dhabi, United Arab Emirates",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.emnlp-main.151",
doi = "10.18653/v1/2022.emnlp-main.151",
pages = "2362--2378",
abstract = "Synthesizing datasets for conversational question answering (CQA) from unlabeled documents remains challenging due to its interactive nature.Moreover, while modeling information needs is an essential key, only few studies have discussed it.In this paper, we introduce a novel framework, **SimSeek**, (**Sim**ulating information-**Seek**ing conversation from unlabeled documents), and compare its two variants.In our baseline, **SimSeek-sym**, a questioner generates follow-up questions upon the predetermined answer by an answerer.On the contrary, **SimSeek-asym** first generates the question and then finds its corresponding answer under the conversational context.Our experiments show that they can synthesize effective training resources for CQA and conversational search tasks.As a result, conversations from **SimSeek-asym** not only make more improvements in our experiments but also are favorably reviewed in a human evaluation.We finally release a large-scale resource of synthetic conversations, **Wiki-SimSeek**, containing 2 million CQA pairs built upon Wikipedia documents.With the dataset, our CQA model achieves the state-of-the-art performance on a recent CQA benchmark, QuAC.The code and dataset are available at https://github.com/naver-ai/simseek",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="kim-etal-2022-generating">
<titleInfo>
<title>Generating Information-Seeking Conversations from Unlabeled Documents</title>
</titleInfo>
<name type="personal">
<namePart type="given">Gangwoo</namePart>
<namePart type="family">Kim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sungdong</namePart>
<namePart type="family">Kim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kang</namePart>
<namePart type="given">Min</namePart>
<namePart type="family">Yoo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jaewoo</namePart>
<namePart type="family">Kang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yoav</namePart>
<namePart type="family">Goldberg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zornitsa</namePart>
<namePart type="family">Kozareva</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yue</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, United Arab Emirates</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Synthesizing datasets for conversational question answering (CQA) from unlabeled documents remains challenging due to its interactive nature.Moreover, while modeling information needs is an essential key, only few studies have discussed it.In this paper, we introduce a novel framework, **SimSeek**, (**Sim**ulating information-**Seek**ing conversation from unlabeled documents), and compare its two variants.In our baseline, **SimSeek-sym**, a questioner generates follow-up questions upon the predetermined answer by an answerer.On the contrary, **SimSeek-asym** first generates the question and then finds its corresponding answer under the conversational context.Our experiments show that they can synthesize effective training resources for CQA and conversational search tasks.As a result, conversations from **SimSeek-asym** not only make more improvements in our experiments but also are favorably reviewed in a human evaluation.We finally release a large-scale resource of synthetic conversations, **Wiki-SimSeek**, containing 2 million CQA pairs built upon Wikipedia documents.With the dataset, our CQA model achieves the state-of-the-art performance on a recent CQA benchmark, QuAC.The code and dataset are available at https://github.com/naver-ai/simseek</abstract>
<identifier type="citekey">kim-etal-2022-generating</identifier>
<identifier type="doi">10.18653/v1/2022.emnlp-main.151</identifier>
<location>
<url>https://aclanthology.org/2022.emnlp-main.151</url>
</location>
<part>
<date>2022-12</date>
<extent unit="page">
<start>2362</start>
<end>2378</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Generating Information-Seeking Conversations from Unlabeled Documents
%A Kim, Gangwoo
%A Kim, Sungdong
%A Yoo, Kang Min
%A Kang, Jaewoo
%Y Goldberg, Yoav
%Y Kozareva, Zornitsa
%Y Zhang, Yue
%S Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing
%D 2022
%8 December
%I Association for Computational Linguistics
%C Abu Dhabi, United Arab Emirates
%F kim-etal-2022-generating
%X Synthesizing datasets for conversational question answering (CQA) from unlabeled documents remains challenging due to its interactive nature.Moreover, while modeling information needs is an essential key, only few studies have discussed it.In this paper, we introduce a novel framework, **SimSeek**, (**Sim**ulating information-**Seek**ing conversation from unlabeled documents), and compare its two variants.In our baseline, **SimSeek-sym**, a questioner generates follow-up questions upon the predetermined answer by an answerer.On the contrary, **SimSeek-asym** first generates the question and then finds its corresponding answer under the conversational context.Our experiments show that they can synthesize effective training resources for CQA and conversational search tasks.As a result, conversations from **SimSeek-asym** not only make more improvements in our experiments but also are favorably reviewed in a human evaluation.We finally release a large-scale resource of synthetic conversations, **Wiki-SimSeek**, containing 2 million CQA pairs built upon Wikipedia documents.With the dataset, our CQA model achieves the state-of-the-art performance on a recent CQA benchmark, QuAC.The code and dataset are available at https://github.com/naver-ai/simseek
%R 10.18653/v1/2022.emnlp-main.151
%U https://aclanthology.org/2022.emnlp-main.151
%U https://doi.org/10.18653/v1/2022.emnlp-main.151
%P 2362-2378
Markdown (Informal)
[Generating Information-Seeking Conversations from Unlabeled Documents](https://aclanthology.org/2022.emnlp-main.151) (Kim et al., EMNLP 2022)
ACL