@inproceedings{bakshandaeva-etal-2022-pauq,
title = "{PAUQ}: Text-to-{SQL} in {R}ussian",
author = "Bakshandaeva, Daria and
Somov, Oleg and
Dmitrieva, Ekaterina and
Davydova, Vera and
Tutubalina, Elena",
editor = "Goldberg, Yoav and
Kozareva, Zornitsa and
Zhang, Yue",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2022",
month = dec,
year = "2022",
address = "Abu Dhabi, United Arab Emirates",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.findings-emnlp.175",
doi = "10.18653/v1/2022.findings-emnlp.175",
pages = "2355--2376",
abstract = "Semantic parsing is an important task that allows to democratize human-computer interaction. One of the most popular text-to-SQL datasets with complex and diverse natural language (NL) questions and SQL queries is Spider. We construct and complement a Spider dataset for Russian, thus creating the first publicly available text-to-SQL dataset for this language. While examining its components - NL questions, SQL queries and databases content - we identify limitations of the existing database structure, fill out missing values for tables and add new requests for underrepresented categories. We select thirty functional test sets with different features that can be used for the evaluation of neural models{'} abilities. To conduct the experiments, we adapt baseline architectures RAT-SQL and BRIDGE and provide in-depth query component analysis. On the target language, both models demonstrate strong results with monolingual training and improved accuracy in multilingual scenario. In this paper, we also study trade-offs between machine-translated and manually-created NL queries. At present, Russian text-to-SQL is lacking in datasets as well as trained models, and we view this work as an important step towards filling this gap.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="bakshandaeva-etal-2022-pauq">
<titleInfo>
<title>PAUQ: Text-to-SQL in Russian</title>
</titleInfo>
<name type="personal">
<namePart type="given">Daria</namePart>
<namePart type="family">Bakshandaeva</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Oleg</namePart>
<namePart type="family">Somov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Dmitrieva</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vera</namePart>
<namePart type="family">Davydova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Elena</namePart>
<namePart type="family">Tutubalina</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2022</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yoav</namePart>
<namePart type="family">Goldberg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zornitsa</namePart>
<namePart type="family">Kozareva</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yue</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, United Arab Emirates</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Semantic parsing is an important task that allows to democratize human-computer interaction. One of the most popular text-to-SQL datasets with complex and diverse natural language (NL) questions and SQL queries is Spider. We construct and complement a Spider dataset for Russian, thus creating the first publicly available text-to-SQL dataset for this language. While examining its components - NL questions, SQL queries and databases content - we identify limitations of the existing database structure, fill out missing values for tables and add new requests for underrepresented categories. We select thirty functional test sets with different features that can be used for the evaluation of neural models’ abilities. To conduct the experiments, we adapt baseline architectures RAT-SQL and BRIDGE and provide in-depth query component analysis. On the target language, both models demonstrate strong results with monolingual training and improved accuracy in multilingual scenario. In this paper, we also study trade-offs between machine-translated and manually-created NL queries. At present, Russian text-to-SQL is lacking in datasets as well as trained models, and we view this work as an important step towards filling this gap.</abstract>
<identifier type="citekey">bakshandaeva-etal-2022-pauq</identifier>
<identifier type="doi">10.18653/v1/2022.findings-emnlp.175</identifier>
<location>
<url>https://aclanthology.org/2022.findings-emnlp.175</url>
</location>
<part>
<date>2022-12</date>
<extent unit="page">
<start>2355</start>
<end>2376</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T PAUQ: Text-to-SQL in Russian
%A Bakshandaeva, Daria
%A Somov, Oleg
%A Dmitrieva, Ekaterina
%A Davydova, Vera
%A Tutubalina, Elena
%Y Goldberg, Yoav
%Y Kozareva, Zornitsa
%Y Zhang, Yue
%S Findings of the Association for Computational Linguistics: EMNLP 2022
%D 2022
%8 December
%I Association for Computational Linguistics
%C Abu Dhabi, United Arab Emirates
%F bakshandaeva-etal-2022-pauq
%X Semantic parsing is an important task that allows to democratize human-computer interaction. One of the most popular text-to-SQL datasets with complex and diverse natural language (NL) questions and SQL queries is Spider. We construct and complement a Spider dataset for Russian, thus creating the first publicly available text-to-SQL dataset for this language. While examining its components - NL questions, SQL queries and databases content - we identify limitations of the existing database structure, fill out missing values for tables and add new requests for underrepresented categories. We select thirty functional test sets with different features that can be used for the evaluation of neural models’ abilities. To conduct the experiments, we adapt baseline architectures RAT-SQL and BRIDGE and provide in-depth query component analysis. On the target language, both models demonstrate strong results with monolingual training and improved accuracy in multilingual scenario. In this paper, we also study trade-offs between machine-translated and manually-created NL queries. At present, Russian text-to-SQL is lacking in datasets as well as trained models, and we view this work as an important step towards filling this gap.
%R 10.18653/v1/2022.findings-emnlp.175
%U https://aclanthology.org/2022.findings-emnlp.175
%U https://doi.org/10.18653/v1/2022.findings-emnlp.175
%P 2355-2376
Markdown (Informal)
[PAUQ: Text-to-SQL in Russian](https://aclanthology.org/2022.findings-emnlp.175) (Bakshandaeva et al., Findings 2022)
ACL
- Daria Bakshandaeva, Oleg Somov, Ekaterina Dmitrieva, Vera Davydova, and Elena Tutubalina. 2022. PAUQ: Text-to-SQL in Russian. In Findings of the Association for Computational Linguistics: EMNLP 2022, pages 2355–2376, Abu Dhabi, United Arab Emirates. Association for Computational Linguistics.