@inproceedings{pronk-etal-2025-bluetoad,
title = "{B}lue{T}oad at {S}em{E}val-2025 Task 3: Using Question-Answering-Based Language Models to Extract Hallucinations from Machine-Generated Text",
author = "Pronk, Michiel and
Kamyshanova, Ekaterina and
Adam, Thijmen and
Van Der Maesen De Sombreff, Maxim",
editor = "Rosenthal, Sara and
Ros{\'a}, Aiala and
Ghosh, Debanjan and
Zampieri, Marcos",
booktitle = "Proceedings of the 19th International Workshop on Semantic Evaluation (SemEval-2025)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.semeval-1.95/",
pages = "684--694",
ISBN = "979-8-89176-273-2",
abstract = "Hallucination in machine-generated text poses big risks in various domains, such as finance, medicine, and engineering. Task 3 of SemEval-2025, Mu-SHROOM, challenges participants to detect hallucinated spans in such text. Our approach uses pre-trained language models and fine-tuning strategies to enhance hallucination span detection, focusing on the English track. Firstly, we applied GPT-4o mini to generate synthetic data by labeling unlabeled data. Then, we employed encoder-only pre-trained language models with a question-answering architecture for hallucination span detection, ultimately choosing XLM-RoBERTa for fine-tuning on multilingual data. This model appeared to be our best and ranked 18th and 22nd on the English track with 0.469 intersection-over-union and 0.441 correlation scores, respectively. It achieved promising results across multiple languages, surpassing baseline methods in 11 out of 13 languages, with Hindi having the highest scores of 0.645 intersection-over-union and 0.684 correlation coefficient. Our findings highlight the potential of a QA approach and using synthetic and multilingual data for hallucination span detection."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="pronk-etal-2025-bluetoad">
<titleInfo>
<title>BlueToad at SemEval-2025 Task 3: Using Question-Answering-Based Language Models to Extract Hallucinations from Machine-Generated Text</title>
</titleInfo>
<name type="personal">
<namePart type="given">Michiel</namePart>
<namePart type="family">Pronk</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Kamyshanova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Thijmen</namePart>
<namePart type="family">Adam</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Maxim</namePart>
<namePart type="family">Van Der Maesen De Sombreff</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 19th International Workshop on Semantic Evaluation (SemEval-2025)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Sara</namePart>
<namePart type="family">Rosenthal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aiala</namePart>
<namePart type="family">Rosá</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Debanjan</namePart>
<namePart type="family">Ghosh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marcos</namePart>
<namePart type="family">Zampieri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-273-2</identifier>
</relatedItem>
<abstract>Hallucination in machine-generated text poses big risks in various domains, such as finance, medicine, and engineering. Task 3 of SemEval-2025, Mu-SHROOM, challenges participants to detect hallucinated spans in such text. Our approach uses pre-trained language models and fine-tuning strategies to enhance hallucination span detection, focusing on the English track. Firstly, we applied GPT-4o mini to generate synthetic data by labeling unlabeled data. Then, we employed encoder-only pre-trained language models with a question-answering architecture for hallucination span detection, ultimately choosing XLM-RoBERTa for fine-tuning on multilingual data. This model appeared to be our best and ranked 18th and 22nd on the English track with 0.469 intersection-over-union and 0.441 correlation scores, respectively. It achieved promising results across multiple languages, surpassing baseline methods in 11 out of 13 languages, with Hindi having the highest scores of 0.645 intersection-over-union and 0.684 correlation coefficient. Our findings highlight the potential of a QA approach and using synthetic and multilingual data for hallucination span detection.</abstract>
<identifier type="citekey">pronk-etal-2025-bluetoad</identifier>
<location>
<url>https://aclanthology.org/2025.semeval-1.95/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>684</start>
<end>694</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T BlueToad at SemEval-2025 Task 3: Using Question-Answering-Based Language Models to Extract Hallucinations from Machine-Generated Text
%A Pronk, Michiel
%A Kamyshanova, Ekaterina
%A Adam, Thijmen
%A Van Der Maesen De Sombreff, Maxim
%Y Rosenthal, Sara
%Y Rosá, Aiala
%Y Ghosh, Debanjan
%Y Zampieri, Marcos
%S Proceedings of the 19th International Workshop on Semantic Evaluation (SemEval-2025)
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-273-2
%F pronk-etal-2025-bluetoad
%X Hallucination in machine-generated text poses big risks in various domains, such as finance, medicine, and engineering. Task 3 of SemEval-2025, Mu-SHROOM, challenges participants to detect hallucinated spans in such text. Our approach uses pre-trained language models and fine-tuning strategies to enhance hallucination span detection, focusing on the English track. Firstly, we applied GPT-4o mini to generate synthetic data by labeling unlabeled data. Then, we employed encoder-only pre-trained language models with a question-answering architecture for hallucination span detection, ultimately choosing XLM-RoBERTa for fine-tuning on multilingual data. This model appeared to be our best and ranked 18th and 22nd on the English track with 0.469 intersection-over-union and 0.441 correlation scores, respectively. It achieved promising results across multiple languages, surpassing baseline methods in 11 out of 13 languages, with Hindi having the highest scores of 0.645 intersection-over-union and 0.684 correlation coefficient. Our findings highlight the potential of a QA approach and using synthetic and multilingual data for hallucination span detection.
%U https://aclanthology.org/2025.semeval-1.95/
%P 684-694
Markdown (Informal)
[BlueToad at SemEval-2025 Task 3: Using Question-Answering-Based Language Models to Extract Hallucinations from Machine-Generated Text](https://aclanthology.org/2025.semeval-1.95/) (Pronk et al., SemEval 2025)
ACL