@inproceedings{nam-etal-2026-idealign,
title = "{IDEA}lign: Comparing Ideas of Large Language Models to Domain Experts",
author = "Nam, HyunJi and
Langlois, Luc{\'i}a and
Malamut, Jim and
Tan, Mei and
Demszky, Dorottya",
editor = "Demberg, Vera and
Inui, Kentaro and
Marquez, Llu{\'i}s",
booktitle = "Proceedings of the 19th Conference of the {E}uropean Chapter of the {A}ssociation for {C}omputational {L}inguistics (Volume 1: Long Papers)",
month = mar,
year = "2026",
address = "Rabat, Morocco",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.eacl-long.182/",
pages = "3908--3925",
ISBN = "979-8-89176-380-7",
abstract = "Large language models (LLMs) are increasingly used to produce open-ended, interpretive annotations, yet there is no validated, scalable measure of ***idea-level similarity*** to expert annotations. We (i) introduce the content evaluation of LLM annotations as a core, understudied task, (ii) propose IDEAlign for capturing expert similarity judgments via the odd-one-out tasks, and (iii) benchmark various similarity methods, such as text embeddings, topic models, and LLM-as-a-judge, against these human ratings. Applying this approach to two real-world educational datasets (interpreting math reasoning and feedback generation), we find that most metrics fail to capture the nuanced dimensions of similarity meaningful to experts. LLM-as-a-judge performs best (11{--}18{\%} improvement over other methods) but still falls short of expert alignment, making it useful as a triage filter rather than a substitute for human review. Our work demonstrates the difficulty of evaluating open-ended LLM annotations at scale, and positions IDEAlign as a reusable protocol for benchmarking on this task, thereby informing responsible deployment of LLMs."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="nam-etal-2026-idealign">
<titleInfo>
<title>IDEAlign: Comparing Ideas of Large Language Models to Domain Experts</title>
</titleInfo>
<name type="personal">
<namePart type="given">HyunJi</namePart>
<namePart type="family">Nam</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lucía</namePart>
<namePart type="family">Langlois</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jim</namePart>
<namePart type="family">Malamut</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mei</namePart>
<namePart type="family">Tan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dorottya</namePart>
<namePart type="family">Demszky</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 19th Conference of the European Chapter of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Vera</namePart>
<namePart type="family">Demberg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kentaro</namePart>
<namePart type="family">Inui</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lluís</namePart>
<namePart type="family">Marquez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Rabat, Morocco</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-380-7</identifier>
</relatedItem>
<abstract>Large language models (LLMs) are increasingly used to produce open-ended, interpretive annotations, yet there is no validated, scalable measure of ***idea-level similarity*** to expert annotations. We (i) introduce the content evaluation of LLM annotations as a core, understudied task, (ii) propose IDEAlign for capturing expert similarity judgments via the odd-one-out tasks, and (iii) benchmark various similarity methods, such as text embeddings, topic models, and LLM-as-a-judge, against these human ratings. Applying this approach to two real-world educational datasets (interpreting math reasoning and feedback generation), we find that most metrics fail to capture the nuanced dimensions of similarity meaningful to experts. LLM-as-a-judge performs best (11–18% improvement over other methods) but still falls short of expert alignment, making it useful as a triage filter rather than a substitute for human review. Our work demonstrates the difficulty of evaluating open-ended LLM annotations at scale, and positions IDEAlign as a reusable protocol for benchmarking on this task, thereby informing responsible deployment of LLMs.</abstract>
<identifier type="citekey">nam-etal-2026-idealign</identifier>
<location>
<url>https://aclanthology.org/2026.eacl-long.182/</url>
</location>
<part>
<date>2026-03</date>
<extent unit="page">
<start>3908</start>
<end>3925</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T IDEAlign: Comparing Ideas of Large Language Models to Domain Experts
%A Nam, HyunJi
%A Langlois, Lucía
%A Malamut, Jim
%A Tan, Mei
%A Demszky, Dorottya
%Y Demberg, Vera
%Y Inui, Kentaro
%Y Marquez, Lluís
%S Proceedings of the 19th Conference of the European Chapter of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2026
%8 March
%I Association for Computational Linguistics
%C Rabat, Morocco
%@ 979-8-89176-380-7
%F nam-etal-2026-idealign
%X Large language models (LLMs) are increasingly used to produce open-ended, interpretive annotations, yet there is no validated, scalable measure of ***idea-level similarity*** to expert annotations. We (i) introduce the content evaluation of LLM annotations as a core, understudied task, (ii) propose IDEAlign for capturing expert similarity judgments via the odd-one-out tasks, and (iii) benchmark various similarity methods, such as text embeddings, topic models, and LLM-as-a-judge, against these human ratings. Applying this approach to two real-world educational datasets (interpreting math reasoning and feedback generation), we find that most metrics fail to capture the nuanced dimensions of similarity meaningful to experts. LLM-as-a-judge performs best (11–18% improvement over other methods) but still falls short of expert alignment, making it useful as a triage filter rather than a substitute for human review. Our work demonstrates the difficulty of evaluating open-ended LLM annotations at scale, and positions IDEAlign as a reusable protocol for benchmarking on this task, thereby informing responsible deployment of LLMs.
%U https://aclanthology.org/2026.eacl-long.182/
%P 3908-3925
Markdown (Informal)
[IDEAlign: Comparing Ideas of Large Language Models to Domain Experts](https://aclanthology.org/2026.eacl-long.182/) (Nam et al., EACL 2026)
ACL
- HyunJi Nam, Lucía Langlois, Jim Malamut, Mei Tan, and Dorottya Demszky. 2026. IDEAlign: Comparing Ideas of Large Language Models to Domain Experts. In Proceedings of the 19th Conference of the European Chapter of the Association for Computational Linguistics (Volume 1: Long Papers), pages 3908–3925, Rabat, Morocco. Association for Computational Linguistics.