@inproceedings{nangia-etal-2026-unsc,
title = "{UNSC}-Bench: Evaluating {LLM} Diplomatic Role-Playing Through {UN} Security Council Vote Prediction",
author = "Nangia, Ayush and
Gokrani, Aman and
Lazzaroni, Ruggero Marino",
editor = "Chen, Pinzhen and
Zouhar, Vil{\'e}m and
Hu, Hanxu and
Khanuja, Simran and
Zhu, Wenhao and
Haddow, Barry and
Birch, Alexandra and
Aji, Alham Fikri and
Sennrich, Rico and
Hooker, Sara",
booktitle = "Proceedings of the First Workshop on Multilingual Multicultural Evaluation",
month = mar,
year = "2026",
address = "Rabat, Morocco",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.mme-main.10/",
pages = "162--176",
ISBN = "979-8-89176-368-5",
abstract = "This paper introduces UNSC-Bench, a benchmark for evaluating Large Language Models (LLMs) in simulating diplomatic decision-making through United Nations Security Council (UNSC) vote prediction. The dataset includes 469 UNSC resolutions from 1947 to 2025, with voting records for the five permanent members (P5) (United States, China, France, Russia, United Kingdom) and translations in four languages. We analyze 26 LLMs, along with thinking variants, across multiple P5 roles and find that (1) without explicit role assignment, models are diplomatically unaligned, defaulting to high yes rates and failing to match any P5 voting pattern, indicating they lack inherent diplomatic identity; (2) model capability (as measured by MMLU-Pro) is strongly correlated with role-playing accuracy; (3) regional models do not outperform others in predicting their home country{'}s votes; and (4) multilingual evaluation reveals that prompt language impacts model predictions, particularly for minority vote outcomes."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="nangia-etal-2026-unsc">
<titleInfo>
<title>UNSC-Bench: Evaluating LLM Diplomatic Role-Playing Through UN Security Council Vote Prediction</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ayush</namePart>
<namePart type="family">Nangia</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aman</namePart>
<namePart type="family">Gokrani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ruggero</namePart>
<namePart type="given">Marino</namePart>
<namePart type="family">Lazzaroni</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the First Workshop on Multilingual Multicultural Evaluation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Pinzhen</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vilém</namePart>
<namePart type="family">Zouhar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hanxu</namePart>
<namePart type="family">Hu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Simran</namePart>
<namePart type="family">Khanuja</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wenhao</namePart>
<namePart type="family">Zhu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Barry</namePart>
<namePart type="family">Haddow</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alexandra</namePart>
<namePart type="family">Birch</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alham</namePart>
<namePart type="given">Fikri</namePart>
<namePart type="family">Aji</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rico</namePart>
<namePart type="family">Sennrich</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sara</namePart>
<namePart type="family">Hooker</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Rabat, Morocco</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-368-5</identifier>
</relatedItem>
<abstract>This paper introduces UNSC-Bench, a benchmark for evaluating Large Language Models (LLMs) in simulating diplomatic decision-making through United Nations Security Council (UNSC) vote prediction. The dataset includes 469 UNSC resolutions from 1947 to 2025, with voting records for the five permanent members (P5) (United States, China, France, Russia, United Kingdom) and translations in four languages. We analyze 26 LLMs, along with thinking variants, across multiple P5 roles and find that (1) without explicit role assignment, models are diplomatically unaligned, defaulting to high yes rates and failing to match any P5 voting pattern, indicating they lack inherent diplomatic identity; (2) model capability (as measured by MMLU-Pro) is strongly correlated with role-playing accuracy; (3) regional models do not outperform others in predicting their home country’s votes; and (4) multilingual evaluation reveals that prompt language impacts model predictions, particularly for minority vote outcomes.</abstract>
<identifier type="citekey">nangia-etal-2026-unsc</identifier>
<location>
<url>https://aclanthology.org/2026.mme-main.10/</url>
</location>
<part>
<date>2026-03</date>
<extent unit="page">
<start>162</start>
<end>176</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T UNSC-Bench: Evaluating LLM Diplomatic Role-Playing Through UN Security Council Vote Prediction
%A Nangia, Ayush
%A Gokrani, Aman
%A Lazzaroni, Ruggero Marino
%Y Chen, Pinzhen
%Y Zouhar, Vilém
%Y Hu, Hanxu
%Y Khanuja, Simran
%Y Zhu, Wenhao
%Y Haddow, Barry
%Y Birch, Alexandra
%Y Aji, Alham Fikri
%Y Sennrich, Rico
%Y Hooker, Sara
%S Proceedings of the First Workshop on Multilingual Multicultural Evaluation
%D 2026
%8 March
%I Association for Computational Linguistics
%C Rabat, Morocco
%@ 979-8-89176-368-5
%F nangia-etal-2026-unsc
%X This paper introduces UNSC-Bench, a benchmark for evaluating Large Language Models (LLMs) in simulating diplomatic decision-making through United Nations Security Council (UNSC) vote prediction. The dataset includes 469 UNSC resolutions from 1947 to 2025, with voting records for the five permanent members (P5) (United States, China, France, Russia, United Kingdom) and translations in four languages. We analyze 26 LLMs, along with thinking variants, across multiple P5 roles and find that (1) without explicit role assignment, models are diplomatically unaligned, defaulting to high yes rates and failing to match any P5 voting pattern, indicating they lack inherent diplomatic identity; (2) model capability (as measured by MMLU-Pro) is strongly correlated with role-playing accuracy; (3) regional models do not outperform others in predicting their home country’s votes; and (4) multilingual evaluation reveals that prompt language impacts model predictions, particularly for minority vote outcomes.
%U https://aclanthology.org/2026.mme-main.10/
%P 162-176
Markdown (Informal)
[UNSC-Bench: Evaluating LLM Diplomatic Role-Playing Through UN Security Council Vote Prediction](https://aclanthology.org/2026.mme-main.10/) (Nangia et al., MME 2026)
ACL