% ACL Anthology record for the WeQA paper (workshop paper, NLP4PI 2025).
% NOTE(review): `address` carries "Vienna, Austria" as exported; this looks like
% the conference venue rather than the publisher's city -- confirm before relying on it.
@inproceedings{meyur-etal-2025-weqa,
  title     = {{WeQA}: A Benchmark for Retrieval Augmented Generation in Wind Energy Domain},
  author    = {Meyur, Rounak and
               Phan, Hung and
               Wagle, Sridevi and
               Strube, Jan and
               Halappanavar, Mahantesh and
               Horawalavithana, Sameera and
               Acharya, Anurag and
               Munikoti, Sai},
  editor    = {Atwell, Katherine and
               Biester, Laura and
               Borah, Angana and
               Dementieva, Daryna and
               Ignat, Oana and
               Kotonya, Neema and
               Liu, Ziyi and
               Wan, Ruyuan and
               Wilson, Steven and
               Zhao, Jieyu},
  booktitle = {Proceedings of the Fourth Workshop on {NLP} for Positive Impact ({NLP4PI})},
  month     = jul,
  year      = {2025},
  address   = {Vienna, Austria},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.nlp4pi-1.20/},
  doi       = {10.18653/v1/2025.nlp4pi-1.20},
  pages     = {239--251},
  isbn      = {978-1-959429-19-7},
  abstract  = {Wind energy project assessments present significant challenges for decision-makers, who must navigate and synthesize hundreds of pages of environmental and scientific documentation. These documents often span different regions and project scales, covering multiple domains of expertise. This process traditionally demands immense time and specialized knowledge from decision-makers. The advent of Large Language Models (LLM) and Retrieval Augmented Generation (RAG) approaches offer a transformative solution, enabling rapid, accurate cross-document information retrieval and synthesis. As the landscape of Natural Language Processing (NLP) and text generation continues to evolve, benchmarking becomes essential to evaluate and compare the performance of different RAG-based LLMs. In this paper, we present a comprehensive framework to generate a domain relevant RAG benchmark. Our framework is based on automatic question-answer generation with Human (domain experts)-AI (LLM) teaming. As a case study, we demonstrate the framework by introducing WeQA, a first-of-its-kind benchmark on the wind energy domain which comprises of multiple scientific documents/reports related to environmental aspects of wind energy projects. Our framework systematically evaluates RAG performance using diverse metrics and multiple question types with varying complexity level, providing a foundation for rigorous assessment of RAG-based systems in complex scientific domains and enabling researchers to identify areas for improvement in domain-specific applications.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- MODS record for the paper with citekey meyur-etal-2025-weqa. -->
  <mods ID="meyur-etal-2025-weqa">
    <titleInfo>
      <title>WeQA: A Benchmark for Retrieval Augmented Generation in Wind Energy Domain</title>
    </titleInfo>
    <!-- Paper authors, in listed order. -->
    <name type="personal">
      <namePart type="given">Rounak</namePart>
      <namePart type="family">Meyur</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Hung</namePart>
      <namePart type="family">Phan</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Sridevi</namePart>
      <namePart type="family">Wagle</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Jan</namePart>
      <namePart type="family">Strube</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Mahantesh</namePart>
      <namePart type="family">Halappanavar</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Sameera</namePart>
      <namePart type="family">Horawalavithana</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Anurag</namePart>
      <namePart type="family">Acharya</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Sai</namePart>
      <namePart type="family">Munikoti</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2025-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, its editors, publisher, place and ISBN. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the Fourth Workshop on NLP for Positive Impact (NLP4PI)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Katherine</namePart>
        <namePart type="family">Atwell</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Laura</namePart>
        <namePart type="family">Biester</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Angana</namePart>
        <namePart type="family">Borah</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Daryna</namePart>
        <namePart type="family">Dementieva</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Oana</namePart>
        <namePart type="family">Ignat</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Neema</namePart>
        <namePart type="family">Kotonya</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Ziyi</namePart>
        <namePart type="family">Liu</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Ruyuan</namePart>
        <namePart type="family">Wan</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Steven</namePart>
        <namePart type="family">Wilson</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jieyu</namePart>
        <namePart type="family">Zhao</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Vienna, Austria</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">978-1-959429-19-7</identifier>
    </relatedItem>
    <abstract>Wind energy project assessments present significant challenges for decision-makers, who must navigate and synthesize hundreds of pages of environmental and scientific documentation. These documents often span different regions and project scales, covering multiple domains of expertise. This process traditionally demands immense time and specialized knowledge from decision-makers. The advent of Large Language Models (LLM) and Retrieval Augmented Generation (RAG) approaches offer a transformative solution, enabling rapid, accurate cross-document information retrieval and synthesis. As the landscape of Natural Language Processing (NLP) and text generation continues to evolve, benchmarking becomes essential to evaluate and compare the performance of different RAG-based LLMs. In this paper, we present a comprehensive framework to generate a domain relevant RAG benchmark. Our framework is based on automatic question-answer generation with Human (domain experts)-AI (LLM) teaming. As a case study, we demonstrate the framework by introducing WeQA, a first-of-its-kind benchmark on the wind energy domain which comprises of multiple scientific documents/reports related to environmental aspects of wind energy projects. Our framework systematically evaluates RAG performance using diverse metrics and multiple question types with varying complexity level, providing a foundation for rigorous assessment of RAG-based systems in complex scientific domains and enabling researchers to identify areas for improvement in domain-specific applications.</abstract>
    <!-- Identifiers and landing page for the paper itself. -->
    <identifier type="citekey">meyur-etal-2025-weqa</identifier>
    <identifier type="doi">10.18653/v1/2025.nlp4pi-1.20</identifier>
    <location>
      <url>https://aclanthology.org/2025.nlp4pi-1.20/</url>
    </location>
    <!-- Page extent of the paper within the host volume. -->
    <part>
      <date>2025-07</date>
      <extent unit="page">
        <start>239</start>
        <end>251</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T WeQA: A Benchmark for Retrieval Augmented Generation in Wind Energy Domain
%A Meyur, Rounak
%A Phan, Hung
%A Wagle, Sridevi
%A Strube, Jan
%A Halappanavar, Mahantesh
%A Horawalavithana, Sameera
%A Acharya, Anurag
%A Munikoti, Sai
%Y Atwell, Katherine
%Y Biester, Laura
%Y Borah, Angana
%Y Dementieva, Daryna
%Y Ignat, Oana
%Y Kotonya, Neema
%Y Liu, Ziyi
%Y Wan, Ruyuan
%Y Wilson, Steven
%Y Zhao, Jieyu
%S Proceedings of the Fourth Workshop on NLP for Positive Impact (NLP4PI)
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 978-1-959429-19-7
%F meyur-etal-2025-weqa
%X Wind energy project assessments present significant challenges for decision-makers, who must navigate and synthesize hundreds of pages of environmental and scientific documentation. These documents often span different regions and project scales, covering multiple domains of expertise. This process traditionally demands immense time and specialized knowledge from decision-makers. The advent of Large Language Models (LLM) and Retrieval Augmented Generation (RAG) approaches offer a transformative solution, enabling rapid, accurate cross-document information retrieval and synthesis. As the landscape of Natural Language Processing (NLP) and text generation continues to evolve, benchmarking becomes essential to evaluate and compare the performance of different RAG-based LLMs. In this paper, we present a comprehensive framework to generate a domain relevant RAG benchmark. Our framework is based on automatic question-answer generation with Human (domain experts)-AI (LLM) teaming. As a case study, we demonstrate the framework by introducing WeQA, a first-of-its-kind benchmark on the wind energy domain which comprises of multiple scientific documents/reports related to environmental aspects of wind energy projects. Our framework systematically evaluates RAG performance using diverse metrics and multiple question types with varying complexity level, providing a foundation for rigorous assessment of RAG-based systems in complex scientific domains and enabling researchers to identify areas for improvement in domain-specific applications.
%R 10.18653/v1/2025.nlp4pi-1.20
%U https://aclanthology.org/2025.nlp4pi-1.20/
%U https://doi.org/10.18653/v1/2025.nlp4pi-1.20
%P 239-251
Markdown (Informal)
[WeQA: A Benchmark for Retrieval Augmented Generation in Wind Energy Domain](https://aclanthology.org/2025.nlp4pi-1.20/) (Meyur et al., NLP4PI 2025)
ACL
- Rounak Meyur, Hung Phan, Sridevi Wagle, Jan Strube, Mahantesh Halappanavar, Sameera Horawalavithana, Anurag Acharya, and Sai Munikoti. 2025. WeQA: A Benchmark for Retrieval Augmented Generation in Wind Energy Domain. In Proceedings of the Fourth Workshop on NLP for Positive Impact (NLP4PI), pages 239–251, Vienna, Austria. Association for Computational Linguistics.