% ACL Anthology record 2024.emnlp-main.1201 (see the url field).
% The generational suffix is entered in the "Last, Jr, First" name form so that
% styles treat "III" as a suffix rather than as part of the surname.
@inproceedings{gor-etal-2024-great,
  title     = {Do great minds think alike? Investigating Human-{AI} Complementarity in Question Answering with {CAIMIRA}},
  author    = {Gor, Maharshi and
               Daum{\'e}, III, Hal and
               Zhou, Tianyi and
               Boyd-Graber, Jordan},
  editor    = {Al-Onaizan, Yaser and
               Bansal, Mohit and
               Chen, Yun-Nung},
  booktitle = {Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing},
  month     = nov,
  year      = {2024},
  address   = {Miami, Florida, USA},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2024.emnlp-main.1201},
  pages     = {21533--21564},
  abstract  = {Recent advancements of large language models (LLMs) have led to claims of AI surpassing humans in natural language processing (NLP) tasks such as textual understanding and reasoning. This work investigates these assertions by introducing CAIMIRA, a novel framework rooted in item response theory (IRT) that enables quantitative assessment and comparison of problem-solving abilities in question-answering (QA) agents. Through analysis of over 300,000 responses from {\textasciitilde}70 AI systems and 155 humans across thousands of quiz questions, CAIMIRA uncovers distinct proficiency patterns in knowledge domains and reasoning skills. Humans outperform AI systems in knowledge-grounded abductive and conceptual reasoning, while state-of-the-art LLMs like GPT-4 Turbo and Llama-3-70B demonstrate superior performance on targeted information retrieval and fact-based reasoning, particularly when information gaps are well-defined and addressable through pattern matching or data retrieval. These findings identify key areas for future QA tasks and model development, highlighting the critical need for questions that not only challenge higher-order reasoning and scientific thinking, but also demand nuanced linguistic and cross-contextual application.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="gor-etal-2024-great">
<titleInfo>
<title>Do great minds think alike? Investigating Human-AI Complementarity in Question Answering with CAIMIRA</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maharshi</namePart>
<namePart type="family">Gor</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hal</namePart>
<namePart type="family">Daumé III</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tianyi</namePart>
<namePart type="family">Zhou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jordan</namePart>
<namePart type="family">Boyd-Graber</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yaser</namePart>
<namePart type="family">Al-Onaizan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohit</namePart>
<namePart type="family">Bansal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yun-Nung</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, Florida, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Recent advancements of large language models (LLMs) have led to claims of AI surpassing humans in natural language processing (NLP) tasks such as textual understanding and reasoning. This work investigates these assertions by introducing CAIMIRA, a novel framework rooted in item response theory (IRT) that enables quantitative assessment and comparison of problem-solving abilities in question-answering (QA) agents. Through analysis of over 300,000 responses from ~70 AI systems and 155 humans across thousands of quiz questions, CAIMIRA uncovers distinct proficiency patterns in knowledge domains and reasoning skills. Humans outperform AI systems in knowledge-grounded abductive and conceptual reasoning, while state-of-the-art LLMs like GPT-4 Turbo and Llama-3-70B demonstrate superior performance on targeted information retrieval and fact-based reasoning, particularly when information gaps are well-defined and addressable through pattern matching or data retrieval. These findings identify key areas for future QA tasks and model development, highlighting the critical need for questions that not only challenge higher-order reasoning and scientific thinking, but also demand nuanced linguistic and cross-contextual application.</abstract>
<identifier type="citekey">gor-etal-2024-great</identifier>
<location>
<url>https://aclanthology.org/2024.emnlp-main.1201</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>21533</start>
<end>21564</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Do great minds think alike? Investigating Human-AI Complementarity in Question Answering with CAIMIRA
%A Gor, Maharshi
%A Daumé III, Hal
%A Zhou, Tianyi
%A Boyd-Graber, Jordan
%Y Al-Onaizan, Yaser
%Y Bansal, Mohit
%Y Chen, Yun-Nung
%S Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F gor-etal-2024-great
%X Recent advancements of large language models (LLMs) have led to claims of AI surpassing humans in natural language processing (NLP) tasks such as textual understanding and reasoning. This work investigates these assertions by introducing CAIMIRA, a novel framework rooted in item response theory (IRT) that enables quantitative assessment and comparison of problem-solving abilities in question-answering (QA) agents. Through analysis of over 300,000 responses from ~70 AI systems and 155 humans across thousands of quiz questions, CAIMIRA uncovers distinct proficiency patterns in knowledge domains and reasoning skills. Humans outperform AI systems in knowledge-grounded abductive and conceptual reasoning, while state-of-the-art LLMs like GPT-4 Turbo and Llama-3-70B demonstrate superior performance on targeted information retrieval and fact-based reasoning, particularly when information gaps are well-defined and addressable through pattern matching or data retrieval. These findings identify key areas for future QA tasks and model development, highlighting the critical need for questions that not only challenge higher-order reasoning and scientific thinking, but also demand nuanced linguistic and cross-contextual application.
%U https://aclanthology.org/2024.emnlp-main.1201
%P 21533-21564
Markdown (Informal)
[Do great minds think alike? Investigating Human-AI Complementarity in Question Answering with CAIMIRA](https://aclanthology.org/2024.emnlp-main.1201) (Gor et al., EMNLP 2024)
ACL