% OrQA paper in the Proceedings of the 4th Table Representation Learning Workshop (2025).
% The title protects the whole word OrQA with braces so sentence-casing styles keep its capitalisation.
@inproceedings{malaguti-etal-2025-orqa,
  title     = {{OrQA} {--} Open Data Retrieval for Question Answering dataset generation},
  author    = {Malaguti, Giovanni and
               Mozzillo, Angelo and
               Simonini, Giovanni},
  editor    = {Chang, Shuaichen and
               Hulsebos, Madelon and
               Liu, Qian and
               Chen, Wenhu and
               Sun, Huan},
  booktitle = {Proceedings of the 4th Table Representation Learning Workshop},
  month     = jul,
  year      = {2025},
  address   = {Vienna, Austria},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.trl-1.14/},
  doi       = {10.18653/v1/2025.trl-1.14},
  pages     = {172--181},
  isbn      = {979-8-89176-268-8},
  abstract  = {We present OrQA, a novel agentic framework to generate large-scale tabular question-answering (TQA) datasets based on real-world open data. Such datasets are needed to overcome the limitations of existing benchmark datasets, which rely on synthetic questions or limited web tables. OrQA employs LLM agents to retrieve related open data tables, generate natural questions, and synthesize executable $\texttt{SQL}$ queries{---}involving joins, unions, and other non-trivial operations. By leveraging hundreds of GPU hours on four NVIDIA A100, we applied OrQA to Canadian and UK government open data to produce 1,000 question-tables{--}SQL triples, a representative sample of which has been human{-}validated. This open{-}source dataset is now publicly available to drive transparency, reproducibility, and progress in table{-}based question answering.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- MODS v3 record for the OrQA paper: three authors, hosted in the
       "Proceedings of the 4th Table Representation Learning Workshop"
       (five editors), pages 172-181. -->
  <mods ID="malaguti-etal-2025-orqa">
    <titleInfo>
      <title>OrQA – Open Data Retrieval for Question Answering dataset generation</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Giovanni</namePart>
      <namePart type="family">Malaguti</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Angelo</namePart>
      <namePart type="family">Mozzillo</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Giovanni</namePart>
      <namePart type="family">Simonini</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2025-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume this paper appears in. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 4th Table Representation Learning Workshop</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Shuaichen</namePart>
        <namePart type="family">Chang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Madelon</namePart>
        <namePart type="family">Hulsebos</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Qian</namePart>
        <namePart type="family">Liu</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Wenhu</namePart>
        <namePart type="family">Chen</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Huan</namePart>
        <namePart type="family">Sun</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Vienna, Austria</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-268-8</identifier>
    </relatedItem>
    <abstract>We present OrQA, a novel agentic framework to generate large-scale tabular question-answering (TQA) datasets based on real-world open data. Such datasets are needed to overcome the limitations of existing benchmark datasets, which rely on synthetic questions or limited web tables. OrQA employs LLM agents to retrieve related open data tables, generate natural questions, and synthesize executable SQL queries—involving joins, unions, and other non-trivial operations. By leveraging hundreds of GPU hours on four NVIDIA A100, we applied OrQA to Canadian and UK government open data to produce 1,000 question-tables–SQL triples, a representative sample of which has been human-validated. This open-source dataset is now publicly available to drive transparency, reproducibility, and progress in table-based question answering.</abstract>
    <identifier type="citekey">malaguti-etal-2025-orqa</identifier>
    <identifier type="doi">10.18653/v1/2025.trl-1.14</identifier>
    <location>
      <url>https://aclanthology.org/2025.trl-1.14/</url>
    </location>
    <part>
      <date>2025-07</date>
      <extent unit="page">
        <start>172</start>
        <end>181</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T OrQA – Open Data Retrieval for Question Answering dataset generation
%A Malaguti, Giovanni
%A Mozzillo, Angelo
%A Simonini, Giovanni
%Y Chang, Shuaichen
%Y Hulsebos, Madelon
%Y Liu, Qian
%Y Chen, Wenhu
%Y Sun, Huan
%S Proceedings of the 4th Table Representation Learning Workshop
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-268-8
%F malaguti-etal-2025-orqa
%X We present OrQA, a novel agentic framework to generate large-scale tabular question-answering (TQA) datasets based on real-world open data. Such datasets are needed to overcome the limitations of existing benchmark datasets, which rely on synthetic questions or limited web tables. OrQA employs LLM agents to retrieve related open data tables, generate natural questions, and synthesize executable SQL queries—involving joins, unions, and other non-trivial operations. By leveraging hundreds of GPU hours on four NVIDIA A100, we applied OrQA to Canadian and UK government open data to produce 1,000 question-tables–SQL triples, a representative sample of which has been human-validated. This open-source dataset is now publicly available to drive transparency, reproducibility, and progress in table-based question answering.
%R 10.18653/v1/2025.trl-1.14
%U https://aclanthology.org/2025.trl-1.14/
%U https://doi.org/10.18653/v1/2025.trl-1.14
%P 172-181
Markdown (Informal)
[OrQA – Open Data Retrieval for Question Answering dataset generation](https://aclanthology.org/2025.trl-1.14/) (Malaguti et al., TRL 2025)
ACL