@inproceedings{sanchez-salido-etal-2025-bilingual,
title = "Bilingual Evaluation of Language Models on General Knowledge in University Entrance Exams with Minimal Contamination",
author = "S{\'a}nchez Salido, Eva and
Morante, Roser and
Gonzalo, Julio and
Marco, Guillermo and
Carrillo-de-Albornoz, Jorge and
Plaza, Laura and
Amigo, Enrique and
Garc{\'i}a, Andr{\'e}s Fernandez and
Benito-Santos, Alejandro and
Ghajari Espinosa, Adri{\'a}n and
Fresno, Victor",
editor = "Rambow, Owen and
Wanner, Leo and
Apidianaki, Marianna and
Al-Khalifa, Hend and
Eugenio, Barbara Di and
Schockaert, Steven",
booktitle = "Proceedings of the 31st International Conference on Computational Linguistics",
month = jan,
year = "2025",
address = "Abu Dhabi, UAE",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.coling-main.413/",
pages = "6184--6200",
abstract = "In this article we present UNED-ACCESS 2024, a bilingual dataset that consists of 1003 multiple-choice questions of university entrance level exams in Spanish and English. Questions are originally formulated in Spanish and manually translated into English, and have not ever been publicly released, ensuring minimal contamination when evaluating Large Language Models with this dataset. A selection of current open-source and proprietary models are evaluated in a uniform zero-shot experimental setting both on the UNED-ACCESS 2024 dataset and on an equivalent subset of MMLU questions. Results show that (i) Smaller models not only perform worse than the largest models, but also degrade faster in Spanish than in English. The performance gap between both languages is negligible for the best models, but grows up to 37{\%} for smaller models; (ii) Model ranking on UNED-ACCESS 2024 is almost identical (0.98 Pearson correlation) to the one obtained with MMLU (a similar, but publicly available benchmark), suggesting that contamination affects similarly to all models, and (iii) As in publicly available datasets, reasoning questions in UNED-ACCESS are more challenging for models of all sizes."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="sanchez-salido-etal-2025-bilingual">
<titleInfo>
<title>Bilingual Evaluation of Language Models on General Knowledge in University Entrance Exams with Minimal Contamination</title>
</titleInfo>
<name type="personal">
<namePart type="given">Eva</namePart>
<namePart type="family">Sánchez Salido</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Roser</namePart>
<namePart type="family">Morante</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Julio</namePart>
<namePart type="family">Gonzalo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Guillermo</namePart>
<namePart type="family">Marco</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jorge</namePart>
<namePart type="family">Carrillo-de-Albornoz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Laura</namePart>
<namePart type="family">Plaza</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Enrique</namePart>
<namePart type="family">Amigo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Andrés</namePart>
<namePart type="given">Fernandez</namePart>
<namePart type="family">García</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alejandro</namePart>
<namePart type="family">Benito-Santos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Adrián</namePart>
<namePart type="family">Ghajari Espinosa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Victor</namePart>
<namePart type="family">Fresno</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-01</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 31st International Conference on Computational Linguistics</title>
</titleInfo>
<name type="personal">
<namePart type="given">Owen</namePart>
<namePart type="family">Rambow</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Leo</namePart>
<namePart type="family">Wanner</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marianna</namePart>
<namePart type="family">Apidianaki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hend</namePart>
<namePart type="family">Al-Khalifa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Barbara</namePart>
<namePart type="given">Di</namePart>
<namePart type="family">Eugenio</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Steven</namePart>
<namePart type="family">Schockaert</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, UAE</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>In this article we present UNED-ACCESS 2024, a bilingual dataset that consists of 1003 multiple-choice questions of university entrance level exams in Spanish and English. Questions are originally formulated in Spanish and manually translated into English, and have not ever been publicly released, ensuring minimal contamination when evaluating Large Language Models with this dataset. A selection of current open-source and proprietary models are evaluated in a uniform zero-shot experimental setting both on the UNED-ACCESS 2024 dataset and on an equivalent subset of MMLU questions. Results show that (i) Smaller models not only perform worse than the largest models, but also degrade faster in Spanish than in English. The performance gap between both languages is negligible for the best models, but grows up to 37% for smaller models; (ii) Model ranking on UNED-ACCESS 2024 is almost identical (0.98 Pearson correlation) to the one obtained with MMLU (a similar, but publicly available benchmark), suggesting that contamination affects similarly to all models, and (iii) As in publicly available datasets, reasoning questions in UNED-ACCESS are more challenging for models of all sizes.</abstract>
<identifier type="citekey">sanchez-salido-etal-2025-bilingual</identifier>
<location>
<url>https://aclanthology.org/2025.coling-main.413/</url>
</location>
<part>
<date>2025-01</date>
<extent unit="page">
<start>6184</start>
<end>6200</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Bilingual Evaluation of Language Models on General Knowledge in University Entrance Exams with Minimal Contamination
%A Sánchez Salido, Eva
%A Morante, Roser
%A Gonzalo, Julio
%A Marco, Guillermo
%A Carrillo-de-Albornoz, Jorge
%A Plaza, Laura
%A Amigo, Enrique
%A García, Andrés Fernandez
%A Benito-Santos, Alejandro
%A Ghajari Espinosa, Adrián
%A Fresno, Victor
%Y Rambow, Owen
%Y Wanner, Leo
%Y Apidianaki, Marianna
%Y Al-Khalifa, Hend
%Y Eugenio, Barbara Di
%Y Schockaert, Steven
%S Proceedings of the 31st International Conference on Computational Linguistics
%D 2025
%8 January
%I Association for Computational Linguistics
%C Abu Dhabi, UAE
%F sanchez-salido-etal-2025-bilingual
%X In this article we present UNED-ACCESS 2024, a bilingual dataset that consists of 1003 multiple-choice questions of university entrance level exams in Spanish and English. Questions are originally formulated in Spanish and manually translated into English, and have not ever been publicly released, ensuring minimal contamination when evaluating Large Language Models with this dataset. A selection of current open-source and proprietary models are evaluated in a uniform zero-shot experimental setting both on the UNED-ACCESS 2024 dataset and on an equivalent subset of MMLU questions. Results show that (i) Smaller models not only perform worse than the largest models, but also degrade faster in Spanish than in English. The performance gap between both languages is negligible for the best models, but grows up to 37% for smaller models; (ii) Model ranking on UNED-ACCESS 2024 is almost identical (0.98 Pearson correlation) to the one obtained with MMLU (a similar, but publicly available benchmark), suggesting that contamination affects similarly to all models, and (iii) As in publicly available datasets, reasoning questions in UNED-ACCESS are more challenging for models of all sizes.
%U https://aclanthology.org/2025.coling-main.413/
%P 6184-6200
Markdown (Informal)
[Bilingual Evaluation of Language Models on General Knowledge in University Entrance Exams with Minimal Contamination](https://aclanthology.org/2025.coling-main.413/) (Sánchez Salido et al., COLING 2025)
ACL
- Eva Sánchez Salido, Roser Morante, Julio Gonzalo, Guillermo Marco, Jorge Carrillo-de-Albornoz, Laura Plaza, Enrique Amigo, Andrés Fernandez García, Alejandro Benito-Santos, Adrián Ghajari Espinosa, and Victor Fresno. 2025. Bilingual Evaluation of Language Models on General Knowledge in University Entrance Exams with Minimal Contamination. In Proceedings of the 31st International Conference on Computational Linguistics, pages 6184–6200, Abu Dhabi, UAE. Association for Computational Linguistics.