@inproceedings{gonzalez-torres-etal-2024-automated,
title = "Automated Question-Answer Generation for Evaluating {RAG}-based Chatbots",
author = "Gonz{\'a}lez Torres, Juan Jos{\'e} and
B{\^\i}ndil{\u{a}}, Mihai Bogdan and
Hofstee, Sebastiaan and
Szondy, Daniel and
Nguyen, Quang-Hung and
Wang, Shenghui and
Englebienne, Gwenn",
editor = "Demner-Fushman, Dina and
Ananiadou, Sophia and
Thompson, Paul and
Ondov, Brian",
booktitle = "Proceedings of the First Workshop on Patient-Oriented Language Processing (CL4Health) @ LREC-COLING 2024",
month = may,
year = "2024",
address = "Torino, Italia",
publisher = "ELRA and ICCL",
url = "https://aclanthology.org/2024.cl4health-1.25",
pages = "204--214",
abstract = "In this research, we propose a framework to generate human-like question-answer pairs with long or factoid answers automatically and, based on them, automatically evaluate the quality of Retrieval-Augmented Generation (RAG). Our framework can also create datasets that assess hallucination levels of Large Language Models (LLMs) by simulating unanswerable questions. We then apply the framework to create a dataset of question-answer (QA) pairs based on more than 1,000 leaflets about the medical and administrative procedures of a hospital. The dataset was evaluated by hospital specialists, who confirmed that more than 50{\%} of the QA pairs are applicable. Finally, we show that our framework can be used to evaluate LLM performance by using Llama-2-13B fine-tuned in Dutch (Vanroy, 2023) with the generated dataset, and show the method{'}s use in testing models with regard to answering unanswerable and factoid questions appears promising.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="gonzalez-torres-etal-2024-automated">
<titleInfo>
<title>Automated Question-Answer Generation for Evaluating RAG-based Chatbots</title>
</titleInfo>
<name type="personal">
<namePart type="given">Juan</namePart>
<namePart type="given">José</namePart>
<namePart type="family">González Torres</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mihai</namePart>
<namePart type="given">Bogdan</namePart>
<namePart type="family">Bîndilă</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sebastiaan</namePart>
<namePart type="family">Hofstee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Daniel</namePart>
<namePart type="family">Szondy</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Quang-Hung</namePart>
<namePart type="family">Nguyen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shenghui</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gwenn</namePart>
<namePart type="family">Englebienne</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the First Workshop on Patient-Oriented Language Processing (CL4Health) @ LREC-COLING 2024</title>
</titleInfo>
<name type="personal">
<namePart type="given">Dina</namePart>
<namePart type="family">Demner-Fushman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sophia</namePart>
<namePart type="family">Ananiadou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Paul</namePart>
<namePart type="family">Thompson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Brian</namePart>
<namePart type="family">Ondov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>ELRA and ICCL</publisher>
<place>
<placeTerm type="text">Torino, Italia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>In this research, we propose a framework to generate human-like question-answer pairs with long or factoid answers automatically and, based on them, automatically evaluate the quality of Retrieval-Augmented Generation (RAG). Our framework can also create datasets that assess hallucination levels of Large Language Models (LLMs) by simulating unanswerable questions. We then apply the framework to create a dataset of question-answer (QA) pairs based on more than 1,000 leaflets about the medical and administrative procedures of a hospital. The dataset was evaluated by hospital specialists, who confirmed that more than 50% of the QA pairs are applicable. Finally, we show that our framework can be used to evaluate LLM performance by using Llama-2-13B fine-tuned in Dutch (Vanroy, 2023) with the generated dataset, and show the method’s use in testing models with regard to answering unanswerable and factoid questions appears promising.</abstract>
<identifier type="citekey">gonzalez-torres-etal-2024-automated</identifier>
<location>
<url>https://aclanthology.org/2024.cl4health-1.25</url>
</location>
<part>
<date>2024-05</date>
<extent unit="page">
<start>204</start>
<end>214</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Automated Question-Answer Generation for Evaluating RAG-based Chatbots
%A González Torres, Juan José
%A Bîndilă, Mihai Bogdan
%A Hofstee, Sebastiaan
%A Szondy, Daniel
%A Nguyen, Quang-Hung
%A Wang, Shenghui
%A Englebienne, Gwenn
%Y Demner-Fushman, Dina
%Y Ananiadou, Sophia
%Y Thompson, Paul
%Y Ondov, Brian
%S Proceedings of the First Workshop on Patient-Oriented Language Processing (CL4Health) @ LREC-COLING 2024
%D 2024
%8 May
%I ELRA and ICCL
%C Torino, Italia
%F gonzalez-torres-etal-2024-automated
%X In this research, we propose a framework to generate human-like question-answer pairs with long or factoid answers automatically and, based on them, automatically evaluate the quality of Retrieval-Augmented Generation (RAG). Our framework can also create datasets that assess hallucination levels of Large Language Models (LLMs) by simulating unanswerable questions. We then apply the framework to create a dataset of question-answer (QA) pairs based on more than 1,000 leaflets about the medical and administrative procedures of a hospital. The dataset was evaluated by hospital specialists, who confirmed that more than 50% of the QA pairs are applicable. Finally, we show that our framework can be used to evaluate LLM performance by using Llama-2-13B fine-tuned in Dutch (Vanroy, 2023) with the generated dataset, and show the method’s use in testing models with regard to answering unanswerable and factoid questions appears promising.
%U https://aclanthology.org/2024.cl4health-1.25
%P 204-214
Markdown (Informal)
[Automated Question-Answer Generation for Evaluating RAG-based Chatbots](https://aclanthology.org/2024.cl4health-1.25) (González Torres et al., CL4Health-WS 2024)
ACL
- Juan José González Torres, Mihai Bogdan Bîndilă, Sebastiaan Hofstee, Daniel Szondy, Quang-Hung Nguyen, Shenghui Wang, and Gwenn Englebienne. 2024. Automated Question-Answer Generation for Evaluating RAG-based Chatbots. In Proceedings of the First Workshop on Patient-Oriented Language Processing (CL4Health) @ LREC-COLING 2024, pages 204–214, Torino, Italia. ELRA and ICCL.