@inproceedings{ligeti-nagy-etal-2025-hugme,
title = "{H}u{GME}: A benchmark system for evaluating {H}ungarian generative {LLM}s",
author = "Ligeti-Nagy, No{\'e}mi and
Madarasz, Gabor and
Foldesi, Flora and
Lengyel, Mariann and
Osvath, Matyas and
Sarossy, Bence and
Varga, Kristof and
Yang, Gy{\H{o}}z{\H{o}} Zijian and
H{\'e}ja, Enik{\H{o}} and
V{\'a}radi, Tam{\'a}s and
Pr{\'o}sz{\'e}ky, G{\'a}bor",
editor = "Arviv, Ofir and
Clinciu, Miruna and
Dhole, Kaustubh and
Dror, Rotem and
Gehrmann, Sebastian and
Habba, Eliya and
Itzhak, Itay and
Mille, Simon and
Perlitz, Yotam and
Santus, Enrico and
Sedoc, Jo{\~a}o and
Shmueli Scheuer, Michal and
Stanovsky, Gabriel and
Tafjord, Oyvind",
booktitle = "Proceedings of the Fourth Workshop on Generation, Evaluation and Metrics (GEM{\texttwosuperior})",
month = jul,
year = "2025",
address = "Vienna, Austria and virtual meeting",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.gem-1.32/",
pages = "385--403",
ISBN = "979-8-89176-261-9",
abstract = "In this study, we introduce the Hungarian Generative Model Evaluation (HuGME) benchmark, a new framework designed to assess the linguistic proficiency of large language models (LLMs) in Hungarian. HuGME evaluates models across a diverse set of linguistic and reasoning skills, including bias, toxicity, faithfulness, relevance, summarization, prompt alignment, readability, spelling, grammaticality, and domain-specific knowledge through tasks like TruthfulQA and MMLU. We applied HuGME to a range of Hungarian LLMs, including those developed in-house as well as several publicly available models that claim Hungarian language proficiency. This paper presents the comparative results of these evaluations, shedding light on the capabilities of current LLMs in processing the Hungarian language. Through our analysis, we aim to both showcase the current state of Hungarian linguistic processing in LLMs and provide a foundational resource for future advancements in the field."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="ligeti-nagy-etal-2025-hugme">
<titleInfo>
<title>HuGME: A benchmark system for evaluating Hungarian generative LLMs</title>
</titleInfo>
<name type="personal">
<namePart type="given">Noémi</namePart>
<namePart type="family">Ligeti-Nagy</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gabor</namePart>
<namePart type="family">Madarasz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Flora</namePart>
<namePart type="family">Foldesi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mariann</namePart>
<namePart type="family">Lengyel</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Matyas</namePart>
<namePart type="family">Osvath</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bence</namePart>
<namePart type="family">Sarossy</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kristof</namePart>
<namePart type="family">Varga</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Győző</namePart>
<namePart type="given">Zijian</namePart>
<namePart type="family">Yang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Enikő</namePart>
<namePart type="family">Héja</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tamás</namePart>
<namePart type="family">Váradi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gábor</namePart>
<namePart type="family">Prószéky</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Fourth Workshop on Generation, Evaluation and Metrics (GEM²)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ofir</namePart>
<namePart type="family">Arviv</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Miruna</namePart>
<namePart type="family">Clinciu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kaustubh</namePart>
<namePart type="family">Dhole</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rotem</namePart>
<namePart type="family">Dror</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sebastian</namePart>
<namePart type="family">Gehrmann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Eliya</namePart>
<namePart type="family">Habba</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Itay</namePart>
<namePart type="family">Itzhak</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Simon</namePart>
<namePart type="family">Mille</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yotam</namePart>
<namePart type="family">Perlitz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Enrico</namePart>
<namePart type="family">Santus</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">João</namePart>
<namePart type="family">Sedoc</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Michal</namePart>
<namePart type="family">Shmueli Scheuer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gabriel</namePart>
<namePart type="family">Stanovsky</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Oyvind</namePart>
<namePart type="family">Tafjord</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria and virtual meeting</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-261-9</identifier>
</relatedItem>
<abstract>In this study, we introduce the Hungarian Generative Model Evaluation (HuGME) benchmark, a new framework designed to assess the linguistic proficiency of large language models (LLMs) in Hungarian. HuGME evaluates models across a diverse set of linguistic and reasoning skills, including bias, toxicity, faithfulness, relevance, summarization, prompt alignment, readability, spelling, grammaticality, and domain-specific knowledge through tasks like TruthfulQA and MMLU. We applied HuGME to a range of Hungarian LLMs, including those developed in-house as well as several publicly available models that claim Hungarian language proficiency. This paper presents the comparative results of these evaluations, shedding light on the capabilities of current LLMs in processing the Hungarian language. Through our analysis, we aim to both showcase the current state of Hungarian linguistic processing in LLMs and provide a foundational resource for future advancements in the field.</abstract>
<identifier type="citekey">ligeti-nagy-etal-2025-hugme</identifier>
<location>
<url>https://aclanthology.org/2025.gem-1.32/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>385</start>
<end>403</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T HuGME: A benchmark system for evaluating Hungarian generative LLMs
%A Ligeti-Nagy, Noémi
%A Madarasz, Gabor
%A Foldesi, Flora
%A Lengyel, Mariann
%A Osvath, Matyas
%A Sarossy, Bence
%A Varga, Kristof
%A Yang, Győző Zijian
%A Héja, Enikő
%A Váradi, Tamás
%A Prószéky, Gábor
%Y Arviv, Ofir
%Y Clinciu, Miruna
%Y Dhole, Kaustubh
%Y Dror, Rotem
%Y Gehrmann, Sebastian
%Y Habba, Eliya
%Y Itzhak, Itay
%Y Mille, Simon
%Y Perlitz, Yotam
%Y Santus, Enrico
%Y Sedoc, João
%Y Shmueli Scheuer, Michal
%Y Stanovsky, Gabriel
%Y Tafjord, Oyvind
%S Proceedings of the Fourth Workshop on Generation, Evaluation and Metrics (GEM²)
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria and virtual meeting
%@ 979-8-89176-261-9
%F ligeti-nagy-etal-2025-hugme
%X In this study, we introduce the Hungarian Generative Model Evaluation (HuGME) benchmark, a new framework designed to assess the linguistic proficiency of large language models (LLMs) in Hungarian. HuGME evaluates models across a diverse set of linguistic and reasoning skills, including bias, toxicity, faithfulness, relevance, summarization, prompt alignment, readability, spelling, grammaticality, and domain-specific knowledge through tasks like TruthfulQA and MMLU. We applied HuGME to a range of Hungarian LLMs, including those developed in-house as well as several publicly available models that claim Hungarian language proficiency. This paper presents the comparative results of these evaluations, shedding light on the capabilities of current LLMs in processing the Hungarian language. Through our analysis, we aim to both showcase the current state of Hungarian linguistic processing in LLMs and provide a foundational resource for future advancements in the field.
%U https://aclanthology.org/2025.gem-1.32/
%P 385-403
Markdown (Informal)
[HuGME: A benchmark system for evaluating Hungarian generative LLMs](https://aclanthology.org/2025.gem-1.32/) (Ligeti-Nagy et al., GEM 2025)
ACL
- Noémi Ligeti-Nagy, Gabor Madarasz, Flora Foldesi, Mariann Lengyel, Matyas Osvath, Bence Sarossy, Kristof Varga, Győző Zijian Yang, Enikő Héja, Tamás Váradi, and Gábor Prószéky. 2025. HuGME: A benchmark system for evaluating Hungarian generative LLMs. In Proceedings of the Fourth Workshop on Generation, Evaluation and Metrics (GEM²), pages 385–403, Vienna, Austria and virtual meeting. Association for Computational Linguistics.