% Conference-paper entry (Findings of the ACL: EMNLP 2025), pages 9042--9063.
% Case-sensitive words in the title are brace-protected as whole words.
@inproceedings{grzybowski-etal-2025-polish,
  title     = {{Polish}-{English} medical knowledge transfer: A new benchmark and results},
  author    = {Grzybowski, {\L}ukasz and
               Pokrywka, Jakub and
               Ciesi{\'o}{\l}ka, Micha{\l} and
               Kaczmarek, Jeremi Ignacy and
               Kubis, Marek},
  editor    = {Christodoulopoulos, Christos and
               Chakraborty, Tanmoy and
               Rose, Carolyn and
               Peng, Violet},
  booktitle = {Findings of the Association for Computational Linguistics: EMNLP 2025},
  month     = nov,
  year      = {2025},
  address   = {Suzhou, China},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.findings-emnlp.480/},
  doi       = {10.18653/v1/2025.findings-emnlp.480},
  pages     = {9042--9063},
  isbn      = {979-8-89176-335-7},
  abstract  = {Large Language Models (LLMs) have demonstrated significant potential in specialized tasks, including medical problem-solving. However, most studies predominantly focus on English-language contexts. This study introduces a novel benchmark dataset based on Polish medical licensing and specialization exams (LEK, LDEK, PES). The dataset, sourced from publicly available materials provided by the Medical Examination Center and the Chief Medical Chamber, includes Polish medical exam questions, along with a subset of parallel Polish-English corpora professionally translated for foreign candidates. By structuring a benchmark from these exam questions, we evaluate state-of-the-art LLMs, spanning general-purpose, domain-specific, and Polish-specific models, and compare their performance with that of human medical students and doctors. Our analysis shows that while models like GPT-4o achieve near-human performance, challenges persist in cross-lingual translation and domain-specific understanding. These findings highlight disparities in model performance across languages and medical specialties, emphasizing the limitations and ethical considerations of deploying LLMs in clinical practice.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 record for citekey grzybowski-etal-2025-polish: five personal
     authors, a host item (the proceedings volume) carrying four editors,
     publisher, place and ISBN, then abstract, identifiers, URL and page extent. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="grzybowski-etal-2025-polish">
    <titleInfo>
      <title>Polish-English medical knowledge transfer: A new benchmark and results</title>
    </titleInfo>
    <!-- Authors, in publication order -->
    <name type="personal">
      <namePart type="given">Łukasz</namePart>
      <namePart type="family">Grzybowski</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Jakub</namePart>
      <namePart type="family">Pokrywka</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Michał</namePart>
      <namePart type="family">Ciesiółka</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Jeremi</namePart>
      <namePart type="given">Ignacy</namePart>
      <namePart type="family">Kaczmarek</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Marek</namePart>
      <namePart type="family">Kubis</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2025-11</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume and its editors -->
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: EMNLP 2025</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Christos</namePart>
        <namePart type="family">Christodoulopoulos</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Tanmoy</namePart>
        <namePart type="family">Chakraborty</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Carolyn</namePart>
        <namePart type="family">Rose</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Violet</namePart>
        <namePart type="family">Peng</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Suzhou, China</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-335-7</identifier>
    </relatedItem>
    <abstract>Large Language Models (LLMs) have demonstrated significant potential in specialized tasks, including medical problem-solving. However, most studies predominantly focus on English-language contexts. This study introduces a novel benchmark dataset based on Polish medical licensing and specialization exams (LEK, LDEK, PES). The dataset, sourced from publicly available materials provided by the Medical Examination Center and the Chief Medical Chamber, includes Polish medical exam questions, along with a subset of parallel Polish-English corpora professionally translated for foreign candidates. By structuring a benchmark from these exam questions, we evaluate state-of-the-art LLMs, spanning general-purpose, domain-specific, and Polish-specific models, and compare their performance with that of human medical students and doctors. Our analysis shows that while models like GPT-4o achieve near-human performance, challenges persist in cross-lingual translation and domain-specific understanding. These findings highlight disparities in model performance across languages and medical specialties, emphasizing the limitations and ethical considerations of deploying LLMs in clinical practice.</abstract>
    <!-- Identifiers and access location -->
    <identifier type="citekey">grzybowski-etal-2025-polish</identifier>
    <identifier type="doi">10.18653/v1/2025.findings-emnlp.480</identifier>
    <location>
      <url>https://aclanthology.org/2025.findings-emnlp.480/</url>
    </location>
    <part>
      <date>2025-11</date>
      <extent unit="page">
        <start>9042</start>
        <end>9063</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Polish-English medical knowledge transfer: A new benchmark and results
%A Grzybowski, Łukasz
%A Pokrywka, Jakub
%A Ciesiółka, Michał
%A Kaczmarek, Jeremi Ignacy
%A Kubis, Marek
%Y Christodoulopoulos, Christos
%Y Chakraborty, Tanmoy
%Y Rose, Carolyn
%Y Peng, Violet
%S Findings of the Association for Computational Linguistics: EMNLP 2025
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-335-7
%F grzybowski-etal-2025-polish
%X Large Language Models (LLMs) have demonstrated significant potential in specialized tasks, including medical problem-solving. However, most studies predominantly focus on English-language contexts. This study introduces a novel benchmark dataset based on Polish medical licensing and specialization exams (LEK, LDEK, PES). The dataset, sourced from publicly available materials provided by the Medical Examination Center and the Chief Medical Chamber, includes Polish medical exam questions, along with a subset of parallel Polish-English corpora professionally translated for foreign candidates. By structuring a benchmark from these exam questions, we evaluate state-of-the-art LLMs, spanning general-purpose, domain-specific, and Polish-specific models, and compare their performance with that of human medical students and doctors. Our analysis shows that while models like GPT-4o achieve near-human performance, challenges persist in cross-lingual translation and domain-specific understanding. These findings highlight disparities in model performance across languages and medical specialties, emphasizing the limitations and ethical considerations of deploying LLMs in clinical practice.
%R 10.18653/v1/2025.findings-emnlp.480
%U https://aclanthology.org/2025.findings-emnlp.480/
%U https://doi.org/10.18653/v1/2025.findings-emnlp.480
%P 9042-9063
Markdown (Informal)
[Polish-English medical knowledge transfer: A new benchmark and results](https://aclanthology.org/2025.findings-emnlp.480/) (Grzybowski et al., Findings 2025)
ACL