@inproceedings{bala-etal-2025-hausanlp,
title = "{H}ausa{NLP} at {S}em{E}val-2025 Task 3: Towards a Fine-Grained Model-Aware Hallucination Detection",
author = "Bala, Maryam and
Abubakar, Amina and
Abubakar, Abdulhamid and
Bichi, Abdulkadir and
Ahmad, Hafsa and
Sani, Sani Abdullahi and
Abdulmumin, Idris and
Muhammad, Shamsuddeen Hassan and
Ahmad, Ibrahim Said",
editor = "Rosenthal, Sara and
Ros{\'a}, Aiala and
Ghosh, Debanjan and
Zampieri, Marcos",
booktitle = "Proceedings of the 19th International Workshop on Semantic Evaluation (SemEval-2025)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.semeval-1.227/",
pages = "1737--1741",
ISBN = "979-8-89176-273-2",
abstract = "This paper presents our findings of the Multilingual Shared Task on Hallucinations and Related Observable Overgeneration Mistakes, MU-SHROOM, which focuses on identifying hallucinations and related overgeneration errors in large language models (LLMs). The shared task involves detecting specific text spans that constitute hallucinations in the outputs generated by LLMs in 14 languages. To address this task, we aim to provide a nuanced, model-aware understanding of hallucination occurrences and severity in English. We used natural language inference and fine-tuned a ModernBERT model using a synthetic dataset of 400 samples, achieving an Intersection over Union (IoU) score of 0.032 and a correlation score of 0.422. These results indicate a moderately positive correlation between the model{'}s confidence scores and the actual presence of hallucinations. The IoU score indicates that our modelhas a relatively low overlap between the predicted hallucination span and the truth annotation. The performance is unsurprising, given the intricate nature of hallucination detection. Hallucinations often manifest subtly, relying on context, making pinpointing their exact boundaries formidable."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="bala-etal-2025-hausanlp">
<titleInfo>
<title>HausaNLP at SemEval-2025 Task 3: Towards a Fine-Grained Model-Aware Hallucination Detection</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maryam</namePart>
<namePart type="family">Bala</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Amina</namePart>
<namePart type="family">Abubakar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Abdulhamid</namePart>
<namePart type="family">Abubakar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Abdulkadir</namePart>
<namePart type="family">Bichi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hafsa</namePart>
<namePart type="family">Ahmad</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sani</namePart>
<namePart type="given">Abdullahi</namePart>
<namePart type="family">Sani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Idris</namePart>
<namePart type="family">Abdulmumin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shamsuddeen</namePart>
<namePart type="given">Hassan</namePart>
<namePart type="family">Muhammad</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ibrahim</namePart>
<namePart type="given">Said</namePart>
<namePart type="family">Ahmad</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 19th International Workshop on Semantic Evaluation (SemEval-2025)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Sara</namePart>
<namePart type="family">Rosenthal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aiala</namePart>
<namePart type="family">Rosá</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Debanjan</namePart>
<namePart type="family">Ghosh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marcos</namePart>
<namePart type="family">Zampieri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-273-2</identifier>
</relatedItem>
<abstract>This paper presents our findings of the Multilingual Shared Task on Hallucinations and Related Observable Overgeneration Mistakes, MU-SHROOM, which focuses on identifying hallucinations and related overgeneration errors in large language models (LLMs). The shared task involves detecting specific text spans that constitute hallucinations in the outputs generated by LLMs in 14 languages. To address this task, we aim to provide a nuanced, model-aware understanding of hallucination occurrences and severity in English. We used natural language inference and fine-tuned a ModernBERT model using a synthetic dataset of 400 samples, achieving an Intersection over Union (IoU) score of 0.032 and a correlation score of 0.422. These results indicate a moderately positive correlation between the model’s confidence scores and the actual presence of hallucinations. The IoU score indicates that our modelhas a relatively low overlap between the predicted hallucination span and the truth annotation. The performance is unsurprising, given the intricate nature of hallucination detection. Hallucinations often manifest subtly, relying on context, making pinpointing their exact boundaries formidable.</abstract>
<identifier type="citekey">bala-etal-2025-hausanlp</identifier>
<location>
<url>https://aclanthology.org/2025.semeval-1.227/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>1737</start>
<end>1741</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T HausaNLP at SemEval-2025 Task 3: Towards a Fine-Grained Model-Aware Hallucination Detection
%A Bala, Maryam
%A Abubakar, Amina
%A Abubakar, Abdulhamid
%A Bichi, Abdulkadir
%A Ahmad, Hafsa
%A Sani, Sani Abdullahi
%A Abdulmumin, Idris
%A Muhammad, Shamsuddeen Hassan
%A Ahmad, Ibrahim Said
%Y Rosenthal, Sara
%Y Rosá, Aiala
%Y Ghosh, Debanjan
%Y Zampieri, Marcos
%S Proceedings of the 19th International Workshop on Semantic Evaluation (SemEval-2025)
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-273-2
%F bala-etal-2025-hausanlp
%X This paper presents our findings from the Multilingual Shared Task on Hallucinations and Related Observable Overgeneration Mistakes, MU-SHROOM, which focuses on identifying hallucinations and related overgeneration errors in large language models (LLMs). The shared task involves detecting the specific text spans that constitute hallucinations in outputs generated by LLMs in 14 languages. To address this task, we aim to provide a nuanced, model-aware understanding of hallucination occurrences and their severity in English. We used natural language inference and fine-tuned a ModernBERT model on a synthetic dataset of 400 samples, achieving an Intersection over Union (IoU) score of 0.032 and a correlation score of 0.422. These results indicate a moderately positive correlation between the model’s confidence scores and the actual presence of hallucinations. The IoU score shows that our model has relatively low overlap between the predicted hallucination spans and the ground-truth annotations. This performance is unsurprising given the intricate nature of hallucination detection: hallucinations often manifest subtly and depend on context, making it difficult to pinpoint their exact boundaries.
%U https://aclanthology.org/2025.semeval-1.227/
%P 1737-1741
Markdown (Informal):
[HausaNLP at SemEval-2025 Task 3: Towards a Fine-Grained Model-Aware Hallucination Detection](https://aclanthology.org/2025.semeval-1.227/) (Bala et al., SemEval 2025)
ACL:
- Maryam Bala, Amina Abubakar, Abdulhamid Abubakar, Abdulkadir Bichi, Hafsa Ahmad, Sani Abdullahi Sani, Idris Abdulmumin, Shamsuddeen Hassan Muhammad, and Ibrahim Said Ahmad. 2025. HausaNLP at SemEval-2025 Task 3: Towards a Fine-Grained Model-Aware Hallucination Detection. In Proceedings of the 19th International Workshop on Semantic Evaluation (SemEval-2025), pages 1737–1741, Vienna, Austria. Association for Computational Linguistics.
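The abstract reports a span-level Intersection over Union (IoU) score against the gold hallucination annotations. As an illustration only (not the authors' code), the following is a minimal sketch of how a character-level IoU between predicted and gold span sets could be computed; the function and variable names are hypothetical.

```python
# Illustrative sketch (not from the paper): character-level IoU between
# predicted and gold hallucination spans, each given as (start, end) offsets.

def span_iou(predicted, gold):
    """Compute IoU over the sets of character indices covered by the spans."""
    pred_chars = set()
    for start, end in predicted:
        pred_chars.update(range(start, end))
    gold_chars = set()
    for start, end in gold:
        gold_chars.update(range(start, end))
    if not pred_chars and not gold_chars:
        return 1.0  # both empty: treat as perfect agreement
    intersection = pred_chars & gold_chars
    union = pred_chars | gold_chars
    return len(intersection) / len(union)

# Example: one predicted span partially overlapping one gold span.
print(span_iou([(10, 20)], [(15, 30)]))  # 5 shared chars / 20 in union = 0.25
```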