@inproceedings{riess-jorgensen-2025-brage,
    title = "The {BRAGE} Benchmark: {Evaluating} Zero-shot Learning Capabilities of Large Language Models for {Norwegian} Customer Service Dialogues",
    author = "Riess, Mike and
      J{\o}rgensen, Tollef Emil",
    editor = "Johansson, Richard and
      Stymne, Sara",
    booktitle = "Proceedings of the Joint 25th Nordic Conference on Computational Linguistics and 11th Baltic Conference on Human Language Technologies (NoDaLiDa/Baltic-HLT 2025)",
    month = mar,
    year = "2025",
    address = "Tallinn, Estonia",
    publisher = "University of Tartu Library",
    url = "https://aclanthology.org/2025.nodalida-1.57/",
    pages = "525--536",
    isbn = "978-9908-53-109-0",
    abstract = "This study explores the capabilities of open-weight Large Language Models in a zero-shot learning setting, testing their ability to classify the content of customer service dialogues in Norwegian from a single instruction, named the BRAGE benchmark. By comparing results against widely used downstream tasks such as question-answering and named entity recognition, we find that (1) specific instruction models greatly exceed base models on the benchmark, (2) both English and multilingual instruction models outperform the tested Norwegian models of similar sizes, and (3) the difference between base and instruction models is less pronounced than in other generative tasks, suggesting that BRAGE is a challenging benchmark, requiring precise and generalizable instruction-tuning."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="riess-jorgensen-2025-brage">
<titleInfo>
<title>The BRAGE Benchmark: Evaluating Zero-shot Learning Capabilities of Large Language Models for Norwegian Customer Service Dialogues</title>
</titleInfo>
<name type="personal">
<namePart type="given">Mike</namePart>
<namePart type="family">Riess</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tollef</namePart>
<namePart type="given">Emil</namePart>
<namePart type="family">Jørgensen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Joint 25th Nordic Conference on Computational Linguistics and 11th Baltic Conference on Human Language Technologies (NoDaLiDa/Baltic-HLT 2025)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Richard</namePart>
<namePart type="family">Johansson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sara</namePart>
<namePart type="family">Stymne</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>University of Tartu Library</publisher>
<place>
<placeTerm type="text">Tallinn, Estonia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">978-9908-53-109-0</identifier>
</relatedItem>
<abstract>This study explores the capabilities of open-weight Large Language Models in a zero-shot learning setting, testing their ability to classify the content of customer service dialogues in Norwegian from a single instruction, named the BRAGE benchmark. By comparing results against widely used downstream tasks such as question-answering and named entity recognition, we find that (1) specific instruction models greatly exceed base models on the benchmark, (2) both English and multilingual instruction models outperform the tested Norwegian models of similar sizes, and (3) the difference between base and instruction models is less pronounced than in other generative tasks, suggesting that BRAGE is a challenging benchmark, requiring precise and generalizable instruction-tuning.</abstract>
<identifier type="citekey">riess-jorgensen-2025-brage</identifier>
<location>
<url>https://aclanthology.org/2025.nodalida-1.57/</url>
</location>
<part>
<date>2025-03</date>
<extent unit="page">
<start>525</start>
<end>536</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T The BRAGE Benchmark: Evaluating Zero-shot Learning Capabilities of Large Language Models for Norwegian Customer Service Dialogues
%A Riess, Mike
%A Jørgensen, Tollef Emil
%Y Johansson, Richard
%Y Stymne, Sara
%S Proceedings of the Joint 25th Nordic Conference on Computational Linguistics and 11th Baltic Conference on Human Language Technologies (NoDaLiDa/Baltic-HLT 2025)
%D 2025
%8 March
%I University of Tartu Library
%C Tallinn, Estonia
%@ 978-9908-53-109-0
%F riess-jorgensen-2025-brage
%X This study explores the capabilities of open-weight Large Language Models in a zero-shot learning setting, testing their ability to classify the content of customer service dialogues in Norwegian from a single instruction, named the BRAGE benchmark. By comparing results against widely used downstream tasks such as question-answering and named entity recognition, we find that (1) specific instruction models greatly exceed base models on the benchmark, (2) both English and multilingual instruction models outperform the tested Norwegian models of similar sizes, and (3) the difference between base and instruction models is less pronounced than in other generative tasks, suggesting that BRAGE is a challenging benchmark, requiring precise and generalizable instruction-tuning.
%U https://aclanthology.org/2025.nodalida-1.57/
%P 525-536
Markdown (Informal)
[The BRAGE Benchmark: Evaluating Zero-shot Learning Capabilities of Large Language Models for Norwegian Customer Service Dialogues](https://aclanthology.org/2025.nodalida-1.57/) (Riess & Jørgensen, NoDaLiDa 2025)
ACL