@inproceedings{harsha-etal-2025-synthetic,
title = "Synthetic Data Generation Using Large Language Models for Financial Question Answering",
author = "Harsha, Chetan and
Phogat, Karmvir Singh and
Dasaratha, Sridhar and
Puranam, Sai Akhil and
Ramakrishna, Shashishekar",
editor = "Chen, Chung-Chi and
Moreno-Sandoval, Antonio and
Huang, Jimin and
Xie, Qianqian and
Ananiadou, Sophia and
Chen, Hsin-Hsi",
booktitle = "Proceedings of the Joint Workshop of the 9th Financial Technology and Natural Language Processing (FinNLP), the 6th Financial Narrative Processing (FNP), and the 1st Workshop on Large Language Models for Finance and Legal (LLMFinLegal)",
month = jan,
year = "2025",
address = "Abu Dhabi, UAE",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.finnlp-1.7/",
pages = "76--95",
abstract = "Recent research has shown excellent performance of large language models (LLMs) for answering questions requiring multi-step financial reasoning. While the larger models have been used with zero-shot or few-shot prompting, the smaller variants need fine-tuning on training data containing questions and the corresponding answers that includes detailed reasoning demonstrations. To alleviate the significant cost of creating a data set with complex questions and corresponding answers, we explore the use of synthetic data for financial question answering using a multi-step LLM based approach to generate question as well as the answers with reasoning steps. We consider standard as well as conversational financial question answering scenarios. We experiment with synthetic data generation for three different real financial reasoning problems that already have manually collected data sets created with the help of financial experts. Using the same document sources, we use the proposed LLM based approach to generate synthetic questions and answers. To measure the effectiveness, we train multiple small language models (SLMs) on these synthetic data and compare the performance with that of the same SLMs trained on the real data. We further perform extensive experimental analysis generating important evidence on the potential of using synthetic data in financial reasoning tasks."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="harsha-etal-2025-synthetic">
    <titleInfo>
      <title>Synthetic Data Generation Using Large Language Models for Financial Question Answering</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Chetan</namePart>
      <namePart type="family">Harsha</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Karmvir</namePart>
      <namePart type="given">Singh</namePart>
      <namePart type="family">Phogat</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Sridhar</namePart>
      <namePart type="family">Dasaratha</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Sai</namePart>
      <namePart type="given">Akhil</namePart>
      <namePart type="family">Puranam</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Shashishekar</namePart>
      <namePart type="family">Ramakrishna</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2025-01</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the Joint Workshop of the 9th Financial Technology and Natural Language Processing (FinNLP), the 6th Financial Narrative Processing (FNP), and the 1st Workshop on Large Language Models for Finance and Legal (LLMFinLegal)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Chung-Chi</namePart>
        <namePart type="family">Chen</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Antonio</namePart>
        <namePart type="family">Moreno-Sandoval</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jimin</namePart>
        <namePart type="family">Huang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Qianqian</namePart>
        <namePart type="family">Xie</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Sophia</namePart>
        <namePart type="family">Ananiadou</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Hsin-Hsi</namePart>
        <namePart type="family">Chen</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Abu Dhabi, UAE</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Recent research has shown excellent performance of large language models (LLMs) in answering questions that require multi-step financial reasoning. While the larger models have been used with zero-shot or few-shot prompting, the smaller variants need fine-tuning on training data containing questions and corresponding answers that include detailed reasoning demonstrations. To alleviate the significant cost of creating a dataset of complex questions and corresponding answers, we explore the use of synthetic data for financial question answering, using a multi-step LLM-based approach to generate both the questions and the answers with reasoning steps. We consider standard as well as conversational financial question answering scenarios. We experiment with synthetic data generation for three real financial reasoning problems that already have manually collected datasets created with the help of financial experts. Using the same document sources, we apply the proposed LLM-based approach to generate synthetic questions and answers. To measure effectiveness, we train multiple small language models (SLMs) on the synthetic data and compare their performance with that of the same SLMs trained on the real data. We further perform an extensive experimental analysis, generating important evidence on the potential of synthetic data for financial reasoning tasks.</abstract>
    <identifier type="citekey">harsha-etal-2025-synthetic</identifier>
    <location>
      <url>https://aclanthology.org/2025.finnlp-1.7/</url>
    </location>
    <part>
      <date>2025-01</date>
      <extent unit="page">
        <start>76</start>
        <end>95</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Synthetic Data Generation Using Large Language Models for Financial Question Answering
%A Harsha, Chetan
%A Phogat, Karmvir Singh
%A Dasaratha, Sridhar
%A Puranam, Sai Akhil
%A Ramakrishna, Shashishekar
%Y Chen, Chung-Chi
%Y Moreno-Sandoval, Antonio
%Y Huang, Jimin
%Y Xie, Qianqian
%Y Ananiadou, Sophia
%Y Chen, Hsin-Hsi
%S Proceedings of the Joint Workshop of the 9th Financial Technology and Natural Language Processing (FinNLP), the 6th Financial Narrative Processing (FNP), and the 1st Workshop on Large Language Models for Finance and Legal (LLMFinLegal)
%D 2025
%8 January
%I Association for Computational Linguistics
%C Abu Dhabi, UAE
%F harsha-etal-2025-synthetic
%X Recent research has shown excellent performance of large language models (LLMs) in answering questions that require multi-step financial reasoning. While the larger models have been used with zero-shot or few-shot prompting, the smaller variants need fine-tuning on training data containing questions and corresponding answers that include detailed reasoning demonstrations. To alleviate the significant cost of creating a dataset of complex questions and corresponding answers, we explore the use of synthetic data for financial question answering, using a multi-step LLM-based approach to generate both the questions and the answers with reasoning steps. We consider standard as well as conversational financial question answering scenarios. We experiment with synthetic data generation for three real financial reasoning problems that already have manually collected datasets created with the help of financial experts. Using the same document sources, we apply the proposed LLM-based approach to generate synthetic questions and answers. To measure effectiveness, we train multiple small language models (SLMs) on the synthetic data and compare their performance with that of the same SLMs trained on the real data. We further perform an extensive experimental analysis, generating important evidence on the potential of synthetic data for financial reasoning tasks.
%U https://aclanthology.org/2025.finnlp-1.7/
%P 76-95
Markdown (Informal)
[Synthetic Data Generation Using Large Language Models for Financial Question Answering](https://aclanthology.org/2025.finnlp-1.7/) (Harsha et al., FinNLP 2025)
ACL
Chetan Harsha, Karmvir Singh Phogat, Sridhar Dasaratha, Sai Akhil Puranam, and Shashishekar Ramakrishna. 2025. Synthetic Data Generation Using Large Language Models for Financial Question Answering. In Proceedings of the Joint Workshop of the 9th Financial Technology and Natural Language Processing (FinNLP), the 6th Financial Narrative Processing (FNP), and the 1st Workshop on Large Language Models for Finance and Legal (LLMFinLegal), pages 76–95, Abu Dhabi, UAE. Association for Computational Linguistics.
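
The abstract describes a multi-step, LLM-based pipeline that generates synthetic questions and answers with reasoning steps from financial documents, which are then used to fine-tune small language models. The sketch below is only a rough illustration of that idea, not the paper's actual prompts, models, or pipeline: it assumes the OpenAI Python SDK and an `OPENAI_API_KEY`, and the prompt wording, the `gpt-4o-mini` model name, and the `generate_synthetic_qa` helper are all hypothetical.

```python
# Illustrative sketch only: a two-step, LLM-based generator of synthetic
# financial QA pairs with reasoning steps, in the spirit of the abstract above.
# The prompts, model name, and step decomposition are assumptions, not the
# paper's method. Requires the `openai` package and an OPENAI_API_KEY env var.
from openai import OpenAI

client = OpenAI()
MODEL = "gpt-4o-mini"  # placeholder; any chat-completion model works


def ask(prompt: str) -> str:
    """Send a single-turn prompt to the LLM and return its text reply."""
    resp = client.chat.completions.create(
        model=MODEL,
        messages=[{"role": "user", "content": prompt}],
    )
    return resp.choices[0].message.content.strip()


def generate_synthetic_qa(document: str) -> dict:
    """Step 1: generate a question from a financial document excerpt.
    Step 2: generate a step-by-step reasoning trace and final answer."""
    question = ask(
        "Read this financial report excerpt and write one question that "
        f"requires multi-step numerical reasoning to answer:\n\n{document}"
    )
    answer = ask(
        "Answer the question using only the excerpt. Show each reasoning "
        f"step, then give the final answer.\n\nExcerpt:\n{document}\n\n"
        f"Question:\n{question}"
    )
    return {"context": document, "question": question, "answer": answer}


if __name__ == "__main__":
    excerpt = (
        "Revenue was $120M in 2023 and $150M in 2024; operating costs were "
        "$80M and $90M respectively."
    )
    print(generate_synthetic_qa(excerpt))
```

Pairs produced this way could then be used to fine-tune an SLM and compared against the same SLM trained on the expert-annotated data, which is the comparison the abstract reports.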