% BibTeX record for ACL Anthology paper 2025.naacl-short.53.
% Proper nouns and acronyms in the title are brace-protected as whole words so
% that sentence-casing bibliography styles keep their capitalisation.
@inproceedings{tiwari-etal-2025-auto,
    title = "{Auto-Cypher}: Improving {LLMs} on {Cypher} generation via {LLM}-supervised generation-verification framework",
    author = "Tiwari, Aman and
      Malay, Shiva Krishna Reddy and
      Yadav, Vikas and
      Hashemi, Masoud and
      Madhusudhan, Sathwik Tejaswi",
    editor = "Chiruzzo, Luis and
      Ritter, Alan and
      Wang, Lu",
    booktitle = "Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 2: Short Papers)",
    month = apr,
    year = "2025",
    address = "Albuquerque, New Mexico",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2025.naacl-short.53/",
    doi = "10.18653/v1/2025.naacl-short.53",
    pages = "623--640",
    isbn = "979-8-89176-190-2",
    abstract = "Graph databases like Neo4j are gaining popularity for handling complex, interconnected data, over traditional relational databases in modeling and querying relationships. While translating natural language into SQL queries is well-researched, generating Cypher queries for Neo4j remains relatively underexplored. In this work, we present an automated, LLM Supervised, pipeline to generate high quality synthetic data for Text2Cypher. Our Cypher data generation pipeline introduces LLM-As-Database-Filler, a novel strategy for ensuring Cypher query correctness, thus resulting in high quality generations. Using our pipeline, we generate high quality Text2Cypher data - SynthCypher containing 29.8k instances across various domains and queries with varying complexities. Training open-source LLMs like LLaMa-3.1-8B, Mistral-7B, and QWEN7B on SynthCypher results in performance gains of up to 40{\%} on the Text2Cypher test split and 30{\%} on the SPIDER benchmark, adapted for graph databases."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- MODS v3 record for citekey tiwari-etal-2025-auto: title, authors, host proceedings (with editors), abstract, identifiers and page extent. -->
  <mods ID="tiwari-etal-2025-auto">
    <titleInfo>
      <title>Auto-Cypher: Improving LLMs on Cypher generation via LLM-supervised generation-verification framework</title>
    </titleInfo>
    <!-- Authors, in publication order. -->
    <name type="personal">
      <namePart type="given">Aman</namePart>
      <namePart type="family">Tiwari</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Shiva</namePart>
      <namePart type="given">Krishna</namePart>
      <namePart type="given">Reddy</namePart>
      <namePart type="family">Malay</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Vikas</namePart>
      <namePart type="family">Yadav</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Masoud</namePart>
      <namePart type="family">Hashemi</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Sathwik</namePart>
      <namePart type="given">Tejaswi</namePart>
      <namePart type="family">Madhusudhan</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2025-04</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, its editors, publisher and ISBN. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 2: Short Papers)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Luis</namePart>
        <namePart type="family">Chiruzzo</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Alan</namePart>
        <namePart type="family">Ritter</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Lu</namePart>
        <namePart type="family">Wang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Albuquerque, New Mexico</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-190-2</identifier>
    </relatedItem>
    <abstract>Graph databases like Neo4j are gaining popularity for handling complex, interconnected data, over traditional relational databases in modeling and querying relationships. While translating natural language into SQL queries is well-researched, generating Cypher queries for Neo4j remains relatively underexplored. In this work, we present an automated, LLM Supervised, pipeline to generate high quality synthetic data for Text2Cypher. Our Cypher data generation pipeline introduces LLM-As-Database-Filler, a novel strategy for ensuring Cypher query correctness, thus resulting in high quality generations. Using our pipeline, we generate high quality Text2Cypher data - SynthCypher containing 29.8k instances across various domains and queries with varying complexities. Training open-source LLMs like LLaMa-3.1-8B, Mistral-7B, and QWEN7B on SynthCypher results in performance gains of up to 40% on the Text2Cypher test split and 30% on the SPIDER benchmark, adapted for graph databases.</abstract>
    <identifier type="citekey">tiwari-etal-2025-auto</identifier>
    <identifier type="doi">10.18653/v1/2025.naacl-short.53</identifier>
    <location>
      <url>https://aclanthology.org/2025.naacl-short.53/</url>
    </location>
    <!-- Position of the paper within the host volume. -->
    <part>
      <date>2025-04</date>
      <extent unit="page">
        <start>623</start>
        <end>640</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Auto-Cypher: Improving LLMs on Cypher generation via LLM-supervised generation-verification framework
%A Tiwari, Aman
%A Malay, Shiva Krishna Reddy
%A Yadav, Vikas
%A Hashemi, Masoud
%A Madhusudhan, Sathwik Tejaswi
%Y Chiruzzo, Luis
%Y Ritter, Alan
%Y Wang, Lu
%S Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 2: Short Papers)
%D 2025
%8 April
%I Association for Computational Linguistics
%C Albuquerque, New Mexico
%@ 979-8-89176-190-2
%F tiwari-etal-2025-auto
%X Graph databases like Neo4j are gaining popularity for handling complex, interconnected data, over traditional relational databases in modeling and querying relationships. While translating natural language into SQL queries is well-researched, generating Cypher queries for Neo4j remains relatively underexplored. In this work, we present an automated, LLM Supervised, pipeline to generate high quality synthetic data for Text2Cypher. Our Cypher data generation pipeline introduces LLM-As-Database-Filler, a novel strategy for ensuring Cypher query correctness, thus resulting in high quality generations. Using our pipeline, we generate high quality Text2Cypher data - SynthCypher containing 29.8k instances across various domains and queries with varying complexities. Training open-source LLMs like LLaMa-3.1-8B, Mistral-7B, and QWEN7B on SynthCypher results in performance gains of up to 40% on the Text2Cypher test split and 30% on the SPIDER benchmark, adapted for graph databases.
%R 10.18653/v1/2025.naacl-short.53
%U https://aclanthology.org/2025.naacl-short.53/
%U https://doi.org/10.18653/v1/2025.naacl-short.53
%P 623-640
Markdown (Informal)
[Auto-Cypher: Improving LLMs on Cypher generation via LLM-supervised generation-verification framework](https://aclanthology.org/2025.naacl-short.53/) (Tiwari et al., NAACL 2025)
ACL