@inproceedings{chauhan-etal-2025-mind,
title = "Mind the Query: A Benchmark Dataset towards {T}ext2{C}ypher Task",
author = "Chauhan, Vashu and
Raj, Shobhit and
Mujumdar, Shashank and
Saha, Avirup and
Jain, Anannay",
editor = "Potdar, Saloni and
Rojas-Barahona, Lina and
Montella, Sebastien",
booktitle = "Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing: Industry Track",
month = nov,
year = "2025",
address = "Suzhou (China)",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.emnlp-industry.133/",
pages = "1890--1905",
ISBN = "979-8-89176-333-3",
abstract = "We present a high-quality, multi-domain dataset for the Text2Cypher task which is enabling the translation of natural language (NL) questions into executable Cypher queries over graph databases. The dataset comprises 27,529 NL queries and corresponding Cyphers spanning across 11 real-world graph datasets, each accompanied by its corresponding graph database for grounded query execution. To ensure correctness, the queries are validated through a rigorous pipeline combining automated schema, runtime and value checks, along with manual review for logical correctness. Queries are further categorized by complexity to support fine-grained evaluation. We have released our benchmark dataset and code to replicate our data synthesis pipeline on new graph datasets, supporting extensibility and future research for the task of Text2Cypher."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="chauhan-etal-2025-mind">
<titleInfo>
<title>Mind the Query: A Benchmark Dataset towards Text2Cypher Task</title>
</titleInfo>
<name type="personal">
<namePart type="given">Vashu</namePart>
<namePart type="family">Chauhan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shobhit</namePart>
<namePart type="family">Raj</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shashank</namePart>
<namePart type="family">Mujumdar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Avirup</namePart>
<namePart type="family">Saha</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anannay</namePart>
<namePart type="family">Jain</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing: Industry Track</title>
</titleInfo>
<name type="personal">
<namePart type="given">Saloni</namePart>
<namePart type="family">Potdar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lina</namePart>
<namePart type="family">Rojas-Barahona</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sebastien</namePart>
<namePart type="family">Montella</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Suzhou (China)</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-333-3</identifier>
</relatedItem>
<abstract>We present a high-quality, multi-domain dataset for the Text2Cypher task which is enabling the translation of natural language (NL) questions into executable Cypher queries over graph databases. The dataset comprises 27,529 NL queries and corresponding Cyphers spanning across 11 real-world graph datasets, each accompanied by its corresponding graph database for grounded query execution. To ensure correctness, the queries are validated through a rigorous pipeline combining automated schema, runtime and value checks, along with manual review for logical correctness. Queries are further categorized by complexity to support fine-grained evaluation. We have released our benchmark dataset and code to replicate our data synthesis pipeline on new graph datasets, supporting extensibility and future research for the task of Text2Cypher.</abstract>
<identifier type="citekey">chauhan-etal-2025-mind</identifier>
<location>
<url>https://aclanthology.org/2025.emnlp-industry.133/</url>
</location>
<part>
<date>2025-11</date>
<extent unit="page">
<start>1890</start>
<end>1905</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Mind the Query: A Benchmark Dataset towards Text2Cypher Task
%A Chauhan, Vashu
%A Raj, Shobhit
%A Mujumdar, Shashank
%A Saha, Avirup
%A Jain, Anannay
%Y Potdar, Saloni
%Y Rojas-Barahona, Lina
%Y Montella, Sebastien
%S Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing: Industry Track
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou (China)
%@ 979-8-89176-333-3
%F chauhan-etal-2025-mind
%X We present a high-quality, multi-domain dataset for the Text2Cypher task which is enabling the translation of natural language (NL) questions into executable Cypher queries over graph databases. The dataset comprises 27,529 NL queries and corresponding Cyphers spanning across 11 real-world graph datasets, each accompanied by its corresponding graph database for grounded query execution. To ensure correctness, the queries are validated through a rigorous pipeline combining automated schema, runtime and value checks, along with manual review for logical correctness. Queries are further categorized by complexity to support fine-grained evaluation. We have released our benchmark dataset and code to replicate our data synthesis pipeline on new graph datasets, supporting extensibility and future research for the task of Text2Cypher.
%U https://aclanthology.org/2025.emnlp-industry.133/
%P 1890-1905
Markdown (Informal)
[Mind the Query: A Benchmark Dataset towards Text2Cypher Task](https://aclanthology.org/2025.emnlp-industry.133/) (Chauhan et al., EMNLP 2025)
ACL
- Vashu Chauhan, Shobhit Raj, Shashank Mujumdar, Avirup Saha, and Anannay Jain. 2025. Mind the Query: A Benchmark Dataset towards Text2Cypher Task. In Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing: Industry Track, pages 1890–1905, Suzhou (China). Association for Computational Linguistics.