% Conference paper record; the same work is repeated below in MODS XML and EndNote form.
% The ampersand in the abstract ("Q\&A") is backslash-escaped because "&" is a LaTeX special character.
@inproceedings{dejl-etal-2026-comprehensiveness,
    title = "Comprehensiveness Metrics for Automatic Evaluation of Factual Recall in Text Generation",
    author = "Dejl, Adam and
      Barry, James and
      Pascale, Alessandra and
      Carnerero-Cano, Javier",
    editor = "Liakata, Maria and
      Moreira, Viviane P. and
      Zhang, Jiajun and
      Jurgens, David",
    booktitle = "Findings of the {A}ssociation for {C}omputational {L}inguistics: {ACL} 2026",
    month = jul,
    year = "2026",
    address = "San Diego, California, United States",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2026.findings-acl.1744/",
    pages = "34931--34966",
    isbn = "979-8-89176-395-1",
    abstract = "Despite demonstrating remarkable performance across a wide range of tasks, large language models (LLMs) have also been found to frequently produce outputs that are incomplete or selectively omit key information. In sensitive domains, such omissions can result in significant harm comparable to that posed by factual inaccuracies, including hallucinations. In this study, we address the challenge of evaluating the comprehensiveness of LLM-generated texts, focusing on the detection of missing information or underrepresented viewpoints. We investigate three automated evaluation metrics: (1) an NLI-based method that decomposes texts into atomic statements and uses natural language inference (NLI) to identify missing facts, (2) a Q\&A-based metric that extracts question-answer pairs and compares responses across sources, and (3) an end-to-end approach that directly identifies missing content using LLMs. Our experiments demonstrate the surprising effectiveness of the simple end-to-end metric compared to more complex metrics, though at the cost of reduced robustness, interpretability and result granularity. We further assess the comprehensiveness of responses from several popular open-weight LLMs when answering user queries based on multiple sources."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="dejl-etal-2026-comprehensiveness">
<titleInfo>
<title>Comprehensiveness Metrics for Automatic Evaluation of Factual Recall in Text Generation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Adam</namePart>
<namePart type="family">Dejl</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">James</namePart>
<namePart type="family">Barry</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alessandra</namePart>
<namePart type="family">Pascale</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Javier</namePart>
<namePart type="family">Carnerero-Cano</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2026</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Liakata</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Viviane</namePart>
<namePart type="given">P</namePart>
<namePart type="family">Moreira</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiajun</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Jurgens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-395-1</identifier>
</relatedItem>
<abstract>Despite demonstrating remarkable performance across a wide range of tasks, large language models (LLMs) have also been found to frequently produce outputs that are incomplete or selectively omit key information. In sensitive domains, such omissions can result in significant harm comparable to that posed by factual inaccuracies, including hallucinations. In this study, we address the challenge of evaluating the comprehensiveness of LLM-generated texts, focusing on the detection of missing information or underrepresented viewpoints. We investigate three automated evaluation metrics: (1) an NLI-based method that decomposes texts into atomic statements and uses natural language inference (NLI) to identify missing facts, (2) a Q&amp;A-based metric that extracts question-answer pairs and compares responses across sources, and (3) an end-to-end approach that directly identifies missing content using LLMs. Our experiments demonstrate the surprising effectiveness of the simple end-to-end metric compared to more complex metrics, though at the cost of reduced robustness, interpretability and result granularity. We further assess the comprehensiveness of responses from several popular open-weight LLMs when answering user queries based on multiple sources.</abstract>
<identifier type="citekey">dejl-etal-2026-comprehensiveness</identifier>
<location>
<url>https://aclanthology.org/2026.findings-acl.1744/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>34931</start>
<end>34966</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Comprehensiveness Metrics for Automatic Evaluation of Factual Recall in Text Generation
%A Dejl, Adam
%A Barry, James
%A Pascale, Alessandra
%A Carnerero-Cano, Javier
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Findings of the Association for Computational Linguistics: ACL 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-395-1
%F dejl-etal-2026-comprehensiveness
%X Despite demonstrating remarkable performance across a wide range of tasks, large language models (LLMs) have also been found to frequently produce outputs that are incomplete or selectively omit key information. In sensitive domains, such omissions can result in significant harm comparable to that posed by factual inaccuracies, including hallucinations. In this study, we address the challenge of evaluating the comprehensiveness of LLM-generated texts, focusing on the detection of missing information or underrepresented viewpoints. We investigate three automated evaluation metrics: (1) an NLI-based method that decomposes texts into atomic statements and uses natural language inference (NLI) to identify missing facts, (2) a Q&A-based metric that extracts question-answer pairs and compares responses across sources, and (3) an end-to-end approach that directly identifies missing content using LLMs. Our experiments demonstrate the surprising effectiveness of the simple end-to-end metric compared to more complex metrics, though at the cost of reduced robustness, interpretability and result granularity. We further assess the comprehensiveness of responses from several popular open-weight LLMs when answering user queries based on multiple sources.
%U https://aclanthology.org/2026.findings-acl.1744/
%P 34931-34966
Markdown (Informal)
[Comprehensiveness Metrics for Automatic Evaluation of Factual Recall in Text Generation](https://aclanthology.org/2026.findings-acl.1744/) (Dejl et al., Findings 2026)
ACL