@inproceedings{sobhani-etal-2025-language,
title = "Language Models can Categorize System Inputs for Performance Analysis",
author = "Sobhani, Dominic and
Zhong, Ruiqi and
Marrese-Taylor, Edison and
Sakaguchi, Keisuke and
Matsuo, Yutaka",
editor = "Chiruzzo, Luis and
Ritter, Alan and
Wang, Lu",
booktitle = "Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)",
month = apr,
year = "2025",
address = "Albuquerque, New Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.naacl-long.317/",
doi = "10.18653/v1/2025.naacl-long.317",
pages = "6241--6257",
ISBN = "979-8-89176-189-6",
abstract = "Language model systems are used to process diverse categories of input requests, ranging from improving creative writing to solving programming challenges. It would be useful to know which categories they are good at. However, existing evaluations compare model performance on pre-defined categories, failing to reflect a system{'}s performance on finer-grained or novel ones. We propose to automatically search for finer-grained categories based on inputs where a system performs well or poorly, and describe them in natural language. To search for these categories, we propose a large number of candidate category descriptions, e.g. ``Communication Improvement'', find the subset of inputs that match the category descriptions, and calculate the performance on these categories; then we sort these categories based on their performance, thereby highlighting those that score high or low. As one application, we apply our method to compare LLaMA 3-70B and Claude 3 Opus, which have similar Elo-ratings on Chatbot Arena; our method finds the former is weaker at making text more professional and humorous while better at providing psychological insights, depicting a more nuanced picture of model performance."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="sobhani-etal-2025-language">
<titleInfo>
<title>Language Models can Categorize System Inputs for Performance Analysis</title>
</titleInfo>
<name type="personal">
<namePart type="given">Dominic</namePart>
<namePart type="family">Sobhani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ruiqi</namePart>
<namePart type="family">Zhong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Edison</namePart>
<namePart type="family">Marrese-Taylor</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Keisuke</namePart>
<namePart type="family">Sakaguchi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yutaka</namePart>
<namePart type="family">Matsuo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-04</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Luis</namePart>
<namePart type="family">Chiruzzo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alan</namePart>
<namePart type="family">Ritter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lu</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Albuquerque, New Mexico</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-189-6</identifier>
</relatedItem>
<abstract>Language model systems are used to process diverse categories of input requests, ranging from improving creative writing to solving programming challenges. It would be useful to know which categories they are good at. However, existing evaluations compare model performance on pre-defined categories, failing to reflect a system’s performance on finer-grained or novel ones. We propose to automatically search for finer-grained categories based on inputs where a system performs well or poorly, and describe them in natural language. To search for these categories, we propose a large number of candidate category descriptions, e.g. “Communication Improvement”, find the subset of inputs that match the category descriptions, and calculate the performance on these categories; then we sort these categories based on their performance, thereby highlighting those that score high or low. As one application, we apply our method to compare LLaMA 3-70B and Claude 3 Opus, which have similar Elo-ratings on Chatbot Arena; our method finds the former is weaker at making text more professional and humorous while better at providing psychological insights, depicting a more nuanced picture of model performance.</abstract>
<identifier type="citekey">sobhani-etal-2025-language</identifier>
<identifier type="doi">10.18653/v1/2025.naacl-long.317</identifier>
<location>
<url>https://aclanthology.org/2025.naacl-long.317/</url>
</location>
<part>
<date>2025-04</date>
<extent unit="page">
<start>6241</start>
<end>6257</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Language Models can Categorize System Inputs for Performance Analysis
%A Sobhani, Dominic
%A Zhong, Ruiqi
%A Marrese-Taylor, Edison
%A Sakaguchi, Keisuke
%A Matsuo, Yutaka
%Y Chiruzzo, Luis
%Y Ritter, Alan
%Y Wang, Lu
%S Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)
%D 2025
%8 April
%I Association for Computational Linguistics
%C Albuquerque, New Mexico
%@ 979-8-89176-189-6
%F sobhani-etal-2025-language
%X Language model systems are used to process diverse categories of input requests, ranging from improving creative writing to solving programming challenges. It would be useful to know which categories they are good at. However, existing evaluations compare model performance on pre-defined categories, failing to reflect a system’s performance on finer-grained or novel ones. We propose to automatically search for finer-grained categories based on inputs where a system performs well or poorly, and describe them in natural language. To search for these categories, we propose a large number of candidate category descriptions, e.g. “Communication Improvement”, find the subset of inputs that match the category descriptions, and calculate the performance on these categories; then we sort these categories based on their performance, thereby highlighting those that score high or low. As one application, we apply our method to compare LLaMA 3-70B and Claude 3 Opus, which have similar Elo-ratings on Chatbot Arena; our method finds the former is weaker at making text more professional and humorous while better at providing psychological insights, depicting a more nuanced picture of model performance.
%R 10.18653/v1/2025.naacl-long.317
%U https://aclanthology.org/2025.naacl-long.317/
%U https://doi.org/10.18653/v1/2025.naacl-long.317
%P 6241-6257
Markdown (Informal)
[Language Models can Categorize System Inputs for Performance Analysis](https://aclanthology.org/2025.naacl-long.317/) (Sobhani et al., NAACL 2025)
ACL
- Dominic Sobhani, Ruiqi Zhong, Edison Marrese-Taylor, Keisuke Sakaguchi, and Yutaka Matsuo. 2025. Language Models can Categorize System Inputs for Performance Analysis. In Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers), pages 6241–6257, Albuquerque, New Mexico. Association for Computational Linguistics.