@inproceedings{buhmann-etal-2022-domain,
title = "Domain- and Task-Adaptation for {V}accin{C}hat{NL}, a {D}utch {COVID}-19 {FAQ} Answering Corpus and Classification Model",
author = "Buhmann, Jeska and
De Bruyn, Maxime and
Lotfi, Ehsan and
Daelemans, Walter",
editor = "Calzolari, Nicoletta and
Huang, Chu-Ren and
Kim, Hansaem and
Pustejovsky, James and
Wanner, Leo and
Choi, Key-Sun and
Ryu, Pum-Mo and
Chen, Hsin-Hsi and
Donatelli, Lucia and
Ji, Heng and
Kurohashi, Sadao and
Paggio, Patrizia and
Xue, Nianwen and
Kim, Seokhwan and
Hahm, Younggyun and
He, Zhong and
Lee, Tony Kyungil and
Santus, Enrico and
Bond, Francis and
Na, Seung-Hoon",
booktitle = "Proceedings of the 29th International Conference on Computational Linguistics",
month = oct,
year = "2022",
address = "Gyeongju, Republic of Korea",
publisher = "International Committee on Computational Linguistics",
url = "https://aclanthology.org/2022.coling-1.312/",
pages = "3539--3549",
abstract = "FAQs are important resources to find information. However, especially if a FAQ concerns many question-answer pairs, it can be a difficult and time-consuming job to find the answer you are looking for. A FAQ chatbot can ease this process by automatically retrieving the relevant answer to a user`s question. We present VaccinChatNL, a Dutch FAQ corpus on the topic of COVID-19 vaccination. Starting with 50 question-answer pairs we built VaccinChat, a FAQ chatbot, which we used to gather more user questions that were also annotated with the appropriate or new answer classes. This iterative process of gathering user questions, annotating them, and retraining the model with the increased data set led to a corpus that now contains 12,883 user questions divided over 181 answers. We provide the first publicly available Dutch FAQ answering data set of this size with large groups of semantically equivalent human-paraphrased questions. Furthermore, our study shows that before fine-tuning a classifier, continued pre-training of Dutch language models with task- and/or domain-specific data improves classification results. In addition, we show that large groups of semantically similar questions are important for obtaining well-performing intent classification models."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="buhmann-etal-2022-domain">
<titleInfo>
<title>Domain- and Task-Adaptation for VaccinChatNL, a Dutch COVID-19 FAQ Answering Corpus and Classification Model</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jeska</namePart>
<namePart type="family">Buhmann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Maxime</namePart>
<namePart type="family">De Bruyn</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ehsan</namePart>
<namePart type="family">Lotfi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Walter</namePart>
<namePart type="family">Daelemans</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-10</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 29th International Conference on Computational Linguistics</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chu-Ren</namePart>
<namePart type="family">Huang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hansaem</namePart>
<namePart type="family">Kim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">James</namePart>
<namePart type="family">Pustejovsky</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Leo</namePart>
<namePart type="family">Wanner</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Key-Sun</namePart>
<namePart type="family">Choi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pum-Mo</namePart>
<namePart type="family">Ryu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hsin-Hsi</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lucia</namePart>
<namePart type="family">Donatelli</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Heng</namePart>
<namePart type="family">Ji</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sadao</namePart>
<namePart type="family">Kurohashi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Patrizia</namePart>
<namePart type="family">Paggio</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nianwen</namePart>
<namePart type="family">Xue</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Seokhwan</namePart>
<namePart type="family">Kim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Younggyun</namePart>
<namePart type="family">Hahm</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhong</namePart>
<namePart type="family">He</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tony</namePart>
<namePart type="given">Kyungil</namePart>
<namePart type="family">Lee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Enrico</namePart>
<namePart type="family">Santus</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Francis</namePart>
<namePart type="family">Bond</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Seung-Hoon</namePart>
<namePart type="family">Na</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>International Committee on Computational Linguistics</publisher>
<place>
<placeTerm type="text">Gyeongju, Republic of Korea</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>FAQs are important resources to find information. However, especially if a FAQ concerns many question-answer pairs, it can be a difficult and time-consuming job to find the answer you are looking for. A FAQ chatbot can ease this process by automatically retrieving the relevant answer to a user‘s question. We present VaccinChatNL, a Dutch FAQ corpus on the topic of COVID-19 vaccination. Starting with 50 question-answer pairs we built VaccinChat, a FAQ chatbot, which we used to gather more user questions that were also annotated with the appropriate or new answer classes. This iterative process of gathering user questions, annotating them, and retraining the model with the increased data set led to a corpus that now contains 12,883 user questions divided over 181 answers. We provide the first publicly available Dutch FAQ answering data set of this size with large groups of semantically equivalent human-paraphrased questions. Furthermore, our study shows that before fine-tuning a classifier, continued pre-training of Dutch language models with task- and/or domain-specific data improves classification results. In addition, we show that large groups of semantically similar questions are important for obtaining well-performing intent classification models.</abstract>
<identifier type="citekey">buhmann-etal-2022-domain</identifier>
<location>
<url>https://aclanthology.org/2022.coling-1.312/</url>
</location>
<part>
<date>2022-10</date>
<extent unit="page">
<start>3539</start>
<end>3549</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Domain- and Task-Adaptation for VaccinChatNL, a Dutch COVID-19 FAQ Answering Corpus and Classification Model
%A Buhmann, Jeska
%A De Bruyn, Maxime
%A Lotfi, Ehsan
%A Daelemans, Walter
%Y Calzolari, Nicoletta
%Y Huang, Chu-Ren
%Y Kim, Hansaem
%Y Pustejovsky, James
%Y Wanner, Leo
%Y Choi, Key-Sun
%Y Ryu, Pum-Mo
%Y Chen, Hsin-Hsi
%Y Donatelli, Lucia
%Y Ji, Heng
%Y Kurohashi, Sadao
%Y Paggio, Patrizia
%Y Xue, Nianwen
%Y Kim, Seokhwan
%Y Hahm, Younggyun
%Y He, Zhong
%Y Lee, Tony Kyungil
%Y Santus, Enrico
%Y Bond, Francis
%Y Na, Seung-Hoon
%S Proceedings of the 29th International Conference on Computational Linguistics
%D 2022
%8 October
%I International Committee on Computational Linguistics
%C Gyeongju, Republic of Korea
%F buhmann-etal-2022-domain
%X FAQs are important resources to find information. However, especially if a FAQ concerns many question-answer pairs, it can be a difficult and time-consuming job to find the answer you are looking for. A FAQ chatbot can ease this process by automatically retrieving the relevant answer to a user‘s question. We present VaccinChatNL, a Dutch FAQ corpus on the topic of COVID-19 vaccination. Starting with 50 question-answer pairs we built VaccinChat, a FAQ chatbot, which we used to gather more user questions that were also annotated with the appropriate or new answer classes. This iterative process of gathering user questions, annotating them, and retraining the model with the increased data set led to a corpus that now contains 12,883 user questions divided over 181 answers. We provide the first publicly available Dutch FAQ answering data set of this size with large groups of semantically equivalent human-paraphrased questions. Furthermore, our study shows that before fine-tuning a classifier, continued pre-training of Dutch language models with task- and/or domain-specific data improves classification results. In addition, we show that large groups of semantically similar questions are important for obtaining well-performing intent classification models.
%U https://aclanthology.org/2022.coling-1.312/
%P 3539-3549
Markdown (Informal)
[Domain- and Task-Adaptation for VaccinChatNL, a Dutch COVID-19 FAQ Answering Corpus and Classification Model](https://aclanthology.org/2022.coling-1.312/) (Buhmann et al., COLING 2022)
ACL