@inproceedings{kramchaninova-defauw-2022-synthetic,
title = "Synthetic Data Generation for Multilingual Domain-Adaptable Question Answering Systems",
author = "Kramchaninova, Alina and
Defauw, Arne",
editor = {Moniz, Helena and
Macken, Lieve and
Rufener, Andrew and
Barrault, Lo{\"i}c and
Costa-juss{\`a}, Marta R. and
Declercq, Christophe and
Koponen, Maarit and
Kemp, Ellie and
Pilos, Spyridon and
Forcada, Mikel L. and
Scarton, Carolina and
Van den Bogaert, Joachim and
Daems, Joke and
Tezcan, Arda and
Vanroy, Bram and
Fonteyne, Margot},
booktitle = "Proceedings of the 23rd Annual Conference of the European Association for Machine Translation",
month = jun,
year = "2022",
address = "Ghent, Belgium",
publisher = "European Association for Machine Translation",
url = "https://aclanthology.org/2022.eamt-1.18/",
pages = "151--160",
abstract = "Deep learning models have significantly advanced the state of the art of question answering systems. However, the majority of datasets available for training such models have been annotated by humans, are open-domain, and are composed primarily in English. To deal with these limitations, we introduce a pipeline that creates synthetic data from natural text. To illustrate the domain-adaptability of our approach, as well as its multilingual potential, we use our pipeline to obtain synthetic data in English and Dutch. We combine the synthetic data with non-synthetic data (SQuAD 2.0) and evaluate multilingual BERT models on the question answering task. Models trained with synthetically augmented data demonstrate a clear improvement in performance when evaluated on the domain-specific test set, compared to the models trained exclusively on SQuAD 2.0. We expect our work to be beneficial for training domain-specific question-answering systems when the amount of available data is limited."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="kramchaninova-defauw-2022-synthetic">
<titleInfo>
<title>Synthetic Data Generation for Multilingual Domain-Adaptable Question Answering Systems</title>
</titleInfo>
<name type="personal">
<namePart type="given">Alina</namePart>
<namePart type="family">Kramchaninova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Arne</namePart>
<namePart type="family">Defauw</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 23rd Annual Conference of the European Association for Machine Translation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Helena</namePart>
<namePart type="family">Moniz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lieve</namePart>
<namePart type="family">Macken</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Andrew</namePart>
<namePart type="family">Rufener</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Loïc</namePart>
<namePart type="family">Barrault</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marta</namePart>
<namePart type="given">R</namePart>
<namePart type="family">Costa-jussà</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Christophe</namePart>
<namePart type="family">Declercq</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Maarit</namePart>
<namePart type="family">Koponen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ellie</namePart>
<namePart type="family">Kemp</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Spyridon</namePart>
<namePart type="family">Pilos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mikel</namePart>
<namePart type="given">L</namePart>
<namePart type="family">Forcada</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Carolina</namePart>
<namePart type="family">Scarton</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joachim</namePart>
<namePart type="family">Van den Bogaert</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joke</namePart>
<namePart type="family">Daems</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Arda</namePart>
<namePart type="family">Tezcan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bram</namePart>
<namePart type="family">Vanroy</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Margot</namePart>
<namePart type="family">Fonteyne</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>European Association for Machine Translation</publisher>
<place>
<placeTerm type="text">Ghent, Belgium</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Deep learning models have significantly advanced the state of the art of question answering systems. However, the majority of datasets available for training such models have been annotated by humans, are open-domain, and are composed primarily in English. To deal with these limitations, we introduce a pipeline that creates synthetic data from natural text. To illustrate the domain-adaptability of our approach, as well as its multilingual potential, we use our pipeline to obtain synthetic data in English and Dutch. We combine the synthetic data with non-synthetic data (SQuAD 2.0) and evaluate multilingual BERT models on the question answering task. Models trained with synthetically augmented data demonstrate a clear improvement in performance when evaluated on the domain-specific test set, compared to the models trained exclusively on SQuAD 2.0. We expect our work to be beneficial for training domain-specific question-answering systems when the amount of available data is limited.</abstract>
<identifier type="citekey">kramchaninova-defauw-2022-synthetic</identifier>
<location>
<url>https://aclanthology.org/2022.eamt-1.18/</url>
</location>
<part>
<date>2022-06</date>
<extent unit="page">
<start>151</start>
<end>160</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Synthetic Data Generation for Multilingual Domain-Adaptable Question Answering Systems
%A Kramchaninova, Alina
%A Defauw, Arne
%Y Moniz, Helena
%Y Macken, Lieve
%Y Rufener, Andrew
%Y Barrault, Loïc
%Y Costa-jussà, Marta R.
%Y Declercq, Christophe
%Y Koponen, Maarit
%Y Kemp, Ellie
%Y Pilos, Spyridon
%Y Forcada, Mikel L.
%Y Scarton, Carolina
%Y Van den Bogaert, Joachim
%Y Daems, Joke
%Y Tezcan, Arda
%Y Vanroy, Bram
%Y Fonteyne, Margot
%S Proceedings of the 23rd Annual Conference of the European Association for Machine Translation
%D 2022
%8 June
%I European Association for Machine Translation
%C Ghent, Belgium
%F kramchaninova-defauw-2022-synthetic
%X Deep learning models have significantly advanced the state of the art of question answering systems. However, the majority of datasets available for training such models have been annotated by humans, are open-domain, and are composed primarily in English. To deal with these limitations, we introduce a pipeline that creates synthetic data from natural text. To illustrate the domain-adaptability of our approach, as well as its multilingual potential, we use our pipeline to obtain synthetic data in English and Dutch. We combine the synthetic data with non-synthetic data (SQuAD 2.0) and evaluate multilingual BERT models on the question answering task. Models trained with synthetically augmented data demonstrate a clear improvement in performance when evaluated on the domain-specific test set, compared to the models trained exclusively on SQuAD 2.0. We expect our work to be beneficial for training domain-specific question-answering systems when the amount of available data is limited.
%U https://aclanthology.org/2022.eamt-1.18/
%P 151-160
Markdown (Informal)
[Synthetic Data Generation for Multilingual Domain-Adaptable Question Answering Systems](https://aclanthology.org/2022.eamt-1.18/) (Kramchaninova & Defauw, EAMT 2022)
ACL