@inproceedings{brown-etal-2025-adaptively,
title = "Adaptively profiling models with task elicitation",
author = "Brown, Davis and
Balehannina, Prithvi and
Jin, Helen and
Havaldar, Shreya and
Hassani, Hamed and
Wong, Eric",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.emnlp-main.1270/",
pages = "24996--25031",
ISBN = "979-8-89176-332-6",
abstract = "Language model evaluations often fail to characterize consequential failure modes, forcing experts to inspect outputs and build new benchmarks. We introduce task elicitation, a method that automatically builds new evaluations to profile model behavior. Task elicitation finds hundreds of natural-language tasks{---}an order of magnitude more than prior work{---}where frontier models exhibit systematic failures, in domains ranging from forecasting to online harassment. For example, we find that Sonnet 3.5 over-associates quantum computing and AGI and that o3-mini is prone to hallucination when fabrications are repeated in-context."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="brown-etal-2025-adaptively">
<titleInfo>
<title>Adaptively profiling models with task elicitation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Davis</namePart>
<namePart type="family">Brown</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Prithvi</namePart>
<namePart type="family">Balehannina</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Helen</namePart>
<namePart type="family">Jin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shreya</namePart>
<namePart type="family">Havaldar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hamed</namePart>
<namePart type="family">Hassani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Eric</namePart>
<namePart type="family">Wong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Christos</namePart>
<namePart type="family">Christodoulopoulos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tanmoy</namePart>
<namePart type="family">Chakraborty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Carolyn</namePart>
<namePart type="family">Rose</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Violet</namePart>
<namePart type="family">Peng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Suzhou, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-332-6</identifier>
</relatedItem>
<abstract>Language model evaluations often fail to characterize consequential failure modes, forcing experts to inspect outputs and build new benchmarks. We introduce task elicitation, a method that automatically builds new evaluations to profile model behavior. Task elicitation finds hundreds of natural-language tasks—an order of magnitude more than prior work—where frontier models exhibit systematic failures, in domains ranging from forecasting to online harassment. For example, we find that Sonnet 3.5 over-associates quantum computing and AGI and that o3-mini is prone to hallucination when fabrications are repeated in-context.</abstract>
<identifier type="citekey">brown-etal-2025-adaptively</identifier>
<location>
<url>https://aclanthology.org/2025.emnlp-main.1270/</url>
</location>
<part>
<date>2025-11</date>
<extent unit="page">
<start>24996</start>
<end>25031</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Adaptively profiling models with task elicitation
%A Brown, Davis
%A Balehannina, Prithvi
%A Jin, Helen
%A Havaldar, Shreya
%A Hassani, Hamed
%A Wong, Eric
%Y Christodoulopoulos, Christos
%Y Chakraborty, Tanmoy
%Y Rose, Carolyn
%Y Peng, Violet
%S Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-332-6
%F brown-etal-2025-adaptively
%X Language model evaluations often fail to characterize consequential failure modes, forcing experts to inspect outputs and build new benchmarks. We introduce task elicitation, a method that automatically builds new evaluations to profile model behavior. Task elicitation finds hundreds of natural-language tasks—an order of magnitude more than prior work—where frontier models exhibit systematic failures, in domains ranging from forecasting to online harassment. For example, we find that Sonnet 3.5 over-associates quantum computing and AGI and that o3-mini is prone to hallucination when fabrications are repeated in-context.
%U https://aclanthology.org/2025.emnlp-main.1270/
%P 24996-25031
Markdown (Informal)
[Adaptively profiling models with task elicitation](https://aclanthology.org/2025.emnlp-main.1270/) (Brown et al., EMNLP 2025)
ACL
- Davis Brown, Prithvi Balehannina, Helen Jin, Shreya Havaldar, Hamed Hassani, and Eric Wong. 2025. Adaptively profiling models with task elicitation. In Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing, pages 24996–25031, Suzhou, China. Association for Computational Linguistics.