@inproceedings{ionov-etal-2025-smurfcat,
title = "{S}murf{C}at at {SHROOM}-{CAP}: Factual but Awkward? Fluent but Wrong? Tackling Both in {LLM} Scientific {QA}",
author = "Ionov, Timur and
Nikolaev, Evgenii and
Vazhentsev, Artem and
Chaichuk, Mikhail and
Korznikov, Anton and
Tutubalina, Elena and
Panchenko, Alexander and
Konovalov, Vasily and
Rykov, Elisei",
editor = {Sinha, Aman and
V{\'a}zquez, Ra{\'u}l and
Mickus, Timothee and
Agarwal, Rohit and
Buhnila, Ioana and
Schmidtov{\'a}, Patr{\'i}cia and
Gamba, Federica and
Prasad, Dilip K. and
Tiedemann, J{\"o}rg},
booktitle = "Proceedings of the 1st Workshop on Confabulation, Hallucinations and Overgeneration in Multilingual and Practical Settings (CHOMPS 2025)",
month = dec,
year = "2025",
address = "Mumbai, India",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.chomps-main.8/",
pages = "81--89",
ISBN = "979-8-89176-308-1",
abstract = "Large Language Models (LLMs) often generate hallucinations, a critical issue in domains like scientific communication where factual accuracy and fluency are essential. The SHROOM-CAP shared task addresses this challenge by evaluating Factual Mistakes and Fluency Mistakes across diverse languages, extending earlier SHROOM editions to the scientific domain. We present Smurfcat, our system for SHROOM-CAP, which integrates three complementary approaches: uncertainty estimation (white-box and black-box signals), encoder-based classifiers (Multilingual Modern BERT), and decoder-based judges (instruction-tuned LLMs with classification heads). Results show that decoder-based judges achieve the strongest overall performance, while uncertainty methods and encoders provide complementary strengths. Our findings highlight the value of combining uncertainty signals with encoder and decoder architectures for robust, multilingual detection of hallucinations and related errors in scientific publications."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="ionov-etal-2025-smurfcat">
<titleInfo>
<title>SmurfCat at SHROOM-CAP: Factual but Awkward? Fluent but Wrong? Tackling Both in LLM Scientific QA</title>
</titleInfo>
<name type="personal">
<namePart type="given">Timur</namePart>
<namePart type="family">Ionov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Evgenii</namePart>
<namePart type="family">Nikolaev</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Artem</namePart>
<namePart type="family">Vazhentsev</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mikhail</namePart>
<namePart type="family">Chaichuk</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anton</namePart>
<namePart type="family">Korznikov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Elena</namePart>
<namePart type="family">Tutubalina</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alexander</namePart>
<namePart type="family">Panchenko</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vasily</namePart>
<namePart type="family">Konovalov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Elisei</namePart>
<namePart type="family">Rykov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 1st Workshop on Confabulation, Hallucinations and Overgeneration in Multilingual and Practical Settings (CHOMPS 2025)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Aman</namePart>
<namePart type="family">Sinha</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Raúl</namePart>
<namePart type="family">Vázquez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Timothee</namePart>
<namePart type="family">Mickus</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rohit</namePart>
<namePart type="family">Agarwal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ioana</namePart>
<namePart type="family">Buhnila</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Patrícia</namePart>
<namePart type="family">Schmidtová</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Federica</namePart>
<namePart type="family">Gamba</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dilip</namePart>
<namePart type="given">K</namePart>
<namePart type="family">Prasad</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jörg</namePart>
<namePart type="family">Tiedemann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Mumbai, India</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-308-1</identifier>
</relatedItem>
<abstract>Large Language Models (LLMs) often generate hallucinations, a critical issue in domains like scientific communication where factual accuracy and fluency are essential. The SHROOM-CAP shared task addresses this challenge by evaluating Factual Mistakes and Fluency Mistakes across diverse languages, extending earlier SHROOM editions to the scientific domain. We present Smurfcat, our system for SHROOM-CAP, which integrates three complementary approaches: uncertainty estimation (white-box and black-box signals), encoder-based classifiers (Multilingual Modern BERT), and decoder-based judges (instruction-tuned LLMs with classification heads). Results show that decoder-based judges achieve the strongest overall performance, while uncertainty methods and encoders provide complementary strengths. Our findings highlight the value of combining uncertainty signals with encoder and decoder architectures for robust, multilingual detection of hallucinations and related errors in scientific publications.</abstract>
<identifier type="citekey">ionov-etal-2025-smurfcat</identifier>
<location>
<url>https://aclanthology.org/2025.chomps-main.8/</url>
</location>
<part>
<date>2025-12</date>
<extent unit="page">
<start>81</start>
<end>89</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T SmurfCat at SHROOM-CAP: Factual but Awkward? Fluent but Wrong? Tackling Both in LLM Scientific QA
%A Ionov, Timur
%A Nikolaev, Evgenii
%A Vazhentsev, Artem
%A Chaichuk, Mikhail
%A Korznikov, Anton
%A Tutubalina, Elena
%A Panchenko, Alexander
%A Konovalov, Vasily
%A Rykov, Elisei
%Y Sinha, Aman
%Y Vázquez, Raúl
%Y Mickus, Timothee
%Y Agarwal, Rohit
%Y Buhnila, Ioana
%Y Schmidtová, Patrícia
%Y Gamba, Federica
%Y Prasad, Dilip K.
%Y Tiedemann, Jörg
%S Proceedings of the 1st Workshop on Confabulation, Hallucinations and Overgeneration in Multilingual and Practical Settings (CHOMPS 2025)
%D 2025
%8 December
%I Association for Computational Linguistics
%C Mumbai, India
%@ 979-8-89176-308-1
%F ionov-etal-2025-smurfcat
%X Large Language Models (LLMs) often generate hallucinations, a critical issue in domains like scientific communication where factual accuracy and fluency are essential. The SHROOM-CAP shared task addresses this challenge by evaluating Factual Mistakes and Fluency Mistakes across diverse languages, extending earlier SHROOM editions to the scientific domain. We present SmurfCat, our system for SHROOM-CAP, which integrates three complementary approaches: uncertainty estimation (white-box and black-box signals), encoder-based classifiers (Multilingual Modern BERT), and decoder-based judges (instruction-tuned LLMs with classification heads). Results show that decoder-based judges achieve the strongest overall performance, while uncertainty methods and encoders provide complementary strengths. Our findings highlight the value of combining uncertainty signals with encoder and decoder architectures for robust, multilingual detection of hallucinations and related errors in scientific publications.
%U https://aclanthology.org/2025.chomps-main.8/
%P 81-89
Markdown (Informal)
[SmurfCat at SHROOM-CAP: Factual but Awkward? Fluent but Wrong? Tackling Both in LLM Scientific QA](https://aclanthology.org/2025.chomps-main.8/) (Ionov et al., CHOMPS 2025)
ACL
- Timur Ionov, Evgenii Nikolaev, Artem Vazhentsev, Mikhail Chaichuk, Anton Korznikov, Elena Tutubalina, Alexander Panchenko, Vasily Konovalov, and Elisei Rykov. 2025. SmurfCat at SHROOM-CAP: Factual but Awkward? Fluent but Wrong? Tackling Both in LLM Scientific QA. In Proceedings of the 1st Workshop on Confabulation, Hallucinations and Overgeneration in Multilingual and Practical Settings (CHOMPS 2025), pages 81–89, Mumbai, India. Association for Computational Linguistics.