@inproceedings{cocchieri-etal-2026-remedqa,
title = "{R}e{M}ed{QA}: Are We Done With Medical Multiple-Choice Benchmarks?",
author = "Cocchieri, Alessio and
Ragazzi, Luca and
Tagliavini, Giuseppe and
Moro, Gianluca",
editor = "Demberg, Vera and
Inui, Kentaro and
Marquez, Llu{\'i}s",
booktitle = "Proceedings of the 19th Conference of the {E}uropean Chapter of the {A}ssociation for {C}omputational {L}inguistics (Volume 1: Long Papers)",
month = mar,
year = "2026",
address = "Rabat, Morocco",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.eacl-long.124/",
pages = "2706--2738",
ISBN = "979-8-89176-380-7",
abstract = "Medical multiple-choice question answering (MCQA) benchmarks show that models achieve near-human accuracy, with some benchmarks approaching saturation{--}leading to claims of clinical readiness. Yet a single accuracy score is a poor proxy for competence: models that change answers under minor input perturbations cannot be considered reliable. We argue that reliability underpins accuracy{--}only consistent predictions make correctness meaningful. We release ReMedQA, a new benchmark that augments three standard medical MCQA datasets with open-ended answers and systematically perturbed options. Building on this, we introduce ReAcc and ReCon, two reliability metrics: ReAcc measures the proportion of questions answered correctly across all variations, while ReCon measures the proportion answered consistently regardless of correctness. Our evaluation shows that high MCQA accuracy masks low reliability: models remain sensitive to format and perturbation changes, and domain specialization offers no robustness gain. MCQA underestimates smaller models while inflating large ones that exploit structural cues{--}with some exceeding 50{\%} accuracy even when the original questions are hidden. This shows that, despite near-saturated accuracy, we are not yet done with medical MCQA benchmarks."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="cocchieri-etal-2026-remedqa">
    <titleInfo>
      <title>ReMedQA: Are We Done With Medical Multiple-Choice Benchmarks?</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Alessio</namePart>
      <namePart type="family">Cocchieri</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Luca</namePart>
      <namePart type="family">Ragazzi</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Giuseppe</namePart>
      <namePart type="family">Tagliavini</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Gianluca</namePart>
      <namePart type="family">Moro</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2026-03</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 19th Conference of the European Chapter of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Vera</namePart>
        <namePart type="family">Demberg</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Kentaro</namePart>
        <namePart type="family">Inui</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Lluís</namePart>
        <namePart type="family">Marquez</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Rabat, Morocco</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-380-7</identifier>
    </relatedItem>
    <abstract>Medical multiple-choice question answering (MCQA) benchmarks show that models achieve near-human accuracy, with some benchmarks approaching saturation–leading to claims of clinical readiness. Yet a single accuracy score is a poor proxy for competence: models that change answers under minor input perturbations cannot be considered reliable. We argue that reliability underpins accuracy–only consistent predictions make correctness meaningful. We release ReMedQA, a new benchmark that augments three standard medical MCQA datasets with open-ended answers and systematically perturbed options. Building on this, we introduce ReAcc and ReCon, two reliability metrics: ReAcc measures the proportion of questions answered correctly across all variations, while ReCon measures the proportion answered consistently regardless of correctness. Our evaluation shows that high MCQA accuracy masks low reliability: models remain sensitive to format and perturbation changes, and domain specialization offers no robustness gain. MCQA underestimates smaller models while inflating large ones that exploit structural cues–with some exceeding 50% accuracy even when the original questions are hidden. This shows that, despite near-saturated accuracy, we are not yet done with medical MCQA benchmarks.</abstract>
    <identifier type="citekey">cocchieri-etal-2026-remedqa</identifier>
    <location>
      <url>https://aclanthology.org/2026.eacl-long.124/</url>
    </location>
    <part>
      <date>2026-03</date>
      <extent unit="page">
        <start>2706</start>
        <end>2738</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T ReMedQA: Are We Done With Medical Multiple-Choice Benchmarks?
%A Cocchieri, Alessio
%A Ragazzi, Luca
%A Tagliavini, Giuseppe
%A Moro, Gianluca
%Y Demberg, Vera
%Y Inui, Kentaro
%Y Marquez, Lluís
%S Proceedings of the 19th Conference of the European Chapter of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2026
%8 March
%I Association for Computational Linguistics
%C Rabat, Morocco
%@ 979-8-89176-380-7
%F cocchieri-etal-2026-remedqa
%X Medical multiple-choice question answering (MCQA) benchmarks show that models achieve near-human accuracy, with some benchmarks approaching saturation–leading to claims of clinical readiness. Yet a single accuracy score is a poor proxy for competence: models that change answers under minor input perturbations cannot be considered reliable. We argue that reliability underpins accuracy–only consistent predictions make correctness meaningful. We release ReMedQA, a new benchmark that augments three standard medical MCQA datasets with open-ended answers and systematically perturbed options. Building on this, we introduce ReAcc and ReCon, two reliability metrics: ReAcc measures the proportion of questions answered correctly across all variations, while ReCon measures the proportion answered consistently regardless of correctness. Our evaluation shows that high MCQA accuracy masks low reliability: models remain sensitive to format and perturbation changes, and domain specialization offers no robustness gain. MCQA underestimates smaller models while inflating large ones that exploit structural cues–with some exceeding 50% accuracy even when the original questions are hidden. This shows that, despite near-saturated accuracy, we are not yet done with medical MCQA benchmarks.
%U https://aclanthology.org/2026.eacl-long.124/
%P 2706-2738
Markdown (Informal)
[ReMedQA: Are We Done With Medical Multiple-Choice Benchmarks?](https://aclanthology.org/2026.eacl-long.124/) (Cocchieri et al., EACL 2026)
ACL
Alessio Cocchieri, Luca Ragazzi, Giuseppe Tagliavini, and Gianluca Moro. 2026. ReMedQA: Are We Done With Medical Multiple-Choice Benchmarks?. In Proceedings of the 19th Conference of the European Chapter of the Association for Computational Linguistics (Volume 1: Long Papers), pages 2706–2738, Rabat, Morocco. Association for Computational Linguistics.
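
As a quick illustration of the two reliability metrics described in the abstract, here is a minimal Python sketch. The function names (`reacc`, `recon`) and the input layout (per-question lists of predictions across perturbed variants) are assumptions made for illustration; they are not the paper's actual implementation.

```python
from typing import Sequence

def reacc(predictions: Sequence[Sequence[str]], gold: Sequence[str]) -> float:
    """Fraction of questions answered correctly across *all* variations."""
    assert len(predictions) == len(gold)
    hits = sum(all(p == g for p in variants)
               for variants, g in zip(predictions, gold))
    return hits / len(gold)

def recon(predictions: Sequence[Sequence[str]]) -> float:
    """Fraction of questions answered consistently, regardless of correctness."""
    consistent = sum(len(set(variants)) == 1 for variants in predictions)
    return consistent / len(predictions)

# Hypothetical example: 3 questions, each asked in 3 perturbed variants.
preds = [["B", "B", "B"],   # consistent and correct
         ["A", "C", "A"],   # inconsistent
         ["D", "D", "D"]]   # consistent but wrong
gold = ["B", "A", "C"]
print(reacc(preds, gold))   # 0.333... -> only question 1 counts
print(recon(preds))         # 0.666... -> questions 1 and 3 count
```

Note how the toy example separates the two notions: a model can be perfectly consistent yet wrong (question 3 raises ReCon but not ReAcc), which is exactly the gap the abstract argues a single accuracy score conceals.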