@inproceedings{humblot-renaux-etal-2026-llms,
title = "{LLM}s as annotators of credibility assessment in {D}anish asylum decisions: evaluating classification performance and errors beyond aggregated metrics",
author = "Humblot-Renaux, Galadrielle and
Jahromi, Mohammad N. S. and
Bakuri-J{\o}rgensen, Rohat and
Heyl, Marieke Anne and
Stage Jarlner, Asta S. and
Vlachou, Maria and
Murphy H{\o}genhaug, Anna and
Elliott, Desmond and
Gammeltoft-Hansen, Thomas and
Moeslund, Thomas B.",
editor = "Liu, Yang Janet and
Gessler, Luke",
booktitle = "Proceedings of the 20th Linguistic Annotation Workshop ({LAW} {XX})",
month = jul,
year = "2026",
address = "San Diego, California, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.law-main.15/",
pages = "187--218",
ISBN = "979-8-89176-404-0",
abstract = "Off-the-shelf large language models (LLMs) are increasingly used to automate text annotation, yet their effectiveness remains underexplored for underrepresented languages and specialized domains where the class definition requires subtle expert understanding. We investigate LLM-based annotation for a novel legal NLP task: identifying the presence and sentiment of credibility assessments in asylum decision texts. We introduce RAB-Cred, a Danish text classification dataset featuring high-quality, expert annotations and valuable metadata such as annotator confidence and asylum case outcome. We benchmark 21 open-weight models and 30 system-user prompt combinations for this task, and systematically evaluate the effect of model and prompt choice for zero-shot and few-shot classification. We zoom in on the errors made by top-performing models and prompts, investigating error consistency across LLMs, inter-class confusion, correlation with human confidence and sample-wise difficulty and severity of LLM mistakes. Our results confirm the potential of LLMs for cost-effective labeling of asylum decisions, but highlight the imperfect and inconsistent nature of LLM annotators, and the need to look beyond the predictions of a single, arbitrarily chosen model. The RAB-Cred dataset and code are available at https://github.com/glhr/RAB-Cred"
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="humblot-renaux-etal-2026-llms">
<titleInfo>
<title>LLMs as annotators of credibility assessment in Danish asylum decisions: evaluating classification performance and errors beyond aggregated metrics</title>
</titleInfo>
<name type="personal">
<namePart type="given">Galadrielle</namePart>
<namePart type="family">Humblot-Renaux</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohammad</namePart>
<namePart type="given">N</namePart>
<namePart type="given">S</namePart>
<namePart type="family">Jahromi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rohat</namePart>
<namePart type="family">Bakuri-Jørgensen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marieke</namePart>
<namePart type="given">Anne</namePart>
<namePart type="family">Heyl</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Asta</namePart>
<namePart type="given">S</namePart>
<namePart type="family">Stage Jarlner</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Vlachou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anna</namePart>
<namePart type="family">Murphy Høgenhaug</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Desmond</namePart>
<namePart type="family">Elliott</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Thomas</namePart>
<namePart type="family">Gammeltoft-Hansen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Thomas</namePart>
<namePart type="given">B</namePart>
<namePart type="family">Moeslund</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 20th Linguistic Annotation Workshop (LAW XX)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yang</namePart>
<namePart type="given">Janet</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Luke</namePart>
<namePart type="family">Gessler</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-404-0</identifier>
</relatedItem>
<abstract>Off-the-shelf large language models (LLMs) are increasingly used to automate text annotation, yet their effectiveness remains underexplored for underrepresented languages and specialized domains where the class definition requires subtle expert understanding. We investigate LLM-based annotation for a novel legal NLP task: identifying the presence and sentiment of credibility assessments in asylum decision texts. We introduce RAB-Cred, a Danish text classification dataset featuring high-quality, expert annotations and valuable metadata such as annotator confidence and asylum case outcome. We benchmark 21 open-weight models and 30 system-user prompt combinations for this task, and systematically evaluate the effect of model and prompt choice for zero-shot and few-shot classification. We zoom in on the errors made by top-performing models and prompts, investigating error consistency across LLMs, inter-class confusion, correlation with human confidence and sample-wise difficulty and severity of LLM mistakes. Our results confirm the potential of LLMs for cost-effective labeling of asylum decisions, but highlight the imperfect and inconsistent nature of LLM annotators, and the need to look beyond the predictions of a single, arbitrarily chosen model. The RAB-Cred dataset and code are available at https://github.com/glhr/RAB-Cred</abstract>
<identifier type="citekey">humblot-renaux-etal-2026-llms</identifier>
<location>
<url>https://aclanthology.org/2026.law-main.15/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>187</start>
<end>218</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T LLMs as annotators of credibility assessment in Danish asylum decisions: evaluating classification performance and errors beyond aggregated metrics
%A Humblot-Renaux, Galadrielle
%A Jahromi, Mohammad N. S.
%A Bakuri-Jørgensen, Rohat
%A Heyl, Marieke Anne
%A Stage Jarlner, Asta S.
%A Vlachou, Maria
%A Murphy Høgenhaug, Anna
%A Elliott, Desmond
%A Gammeltoft-Hansen, Thomas
%A Moeslund, Thomas B.
%Y Liu, Yang Janet
%Y Gessler, Luke
%S Proceedings of the 20th Linguistic Annotation Workshop (LAW XX)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, USA
%@ 979-8-89176-404-0
%F humblot-renaux-etal-2026-llms
%X Off-the-shelf large language models (LLMs) are increasingly used to automate text annotation, yet their effectiveness remains underexplored for underrepresented languages and specialized domains where the class definition requires subtle expert understanding. We investigate LLM-based annotation for a novel legal NLP task: identifying the presence and sentiment of credibility assessments in asylum decision texts. We introduce RAB-Cred, a Danish text classification dataset featuring high-quality, expert annotations and valuable metadata such as annotator confidence and asylum case outcome. We benchmark 21 open-weight models and 30 system-user prompt combinations for this task, and systematically evaluate the effect of model and prompt choice for zero-shot and few-shot classification. We zoom in on the errors made by top-performing models and prompts, investigating error consistency across LLMs, inter-class confusion, correlation with human confidence and sample-wise difficulty and severity of LLM mistakes. Our results confirm the potential of LLMs for cost-effective labeling of asylum decisions, but highlight the imperfect and inconsistent nature of LLM annotators, and the need to look beyond the predictions of a single, arbitrarily chosen model. The RAB-Cred dataset and code are available at https://github.com/glhr/RAB-Cred
%U https://aclanthology.org/2026.law-main.15/
%P 187-218
Markdown (Informal)
[LLMs as annotators of credibility assessment in Danish asylum decisions: evaluating classification performance and errors beyond aggregated metrics](https://aclanthology.org/2026.law-main.15/) (Humblot-Renaux et al., LAW 2026)
ACL
- Galadrielle Humblot-Renaux, Mohammad N. S. Jahromi, Rohat Bakuri-Jørgensen, Marieke Anne Heyl, Asta S. Stage Jarlner, Maria Vlachou, Anna Murphy Høgenhaug, Desmond Elliott, Thomas Gammeltoft-Hansen, and Thomas B. Moeslund. 2026. LLMs as annotators of credibility assessment in Danish asylum decisions: evaluating classification performance and errors beyond aggregated metrics. In Proceedings of the 20th Linguistic Annotation Workshop (LAW XX), pages 187–218, San Diego, California, USA. Association for Computational Linguistics.