% Conference paper record. Braces inside title/booktitle protect proper nouns
% and acronyms from style-driven recasing; the abstract field is informational.
@inproceedings{masano-etal-2026-constructing,
    title     = {Constructing a {Japanese} Verdict Prediction Dataset for Fact-Checking of {LLM}-Generated Texts},
    author    = {Masano, Miwa and
                 Kiyomaru, Hirokazu and
                 Keyaki, Atsushi and
                 Horio, Kaito and
                 Minamoto, Rei and
                 Keyaki, Ribeka and
                 Nakayama, Kouta and
                 Tachibana, Hideyuki and
                 Kawahara, Daisuke},
    editor    = {T.Y.S.S., Santosh and
                 Rodriguez, Juan Diego and
                 de Gibert, Ona},
    booktitle = {Proceedings of the 64th Annual Meeting of the {Association} for {Computational} {Linguistics} ({ACL} 2026)},
    month     = jul,
    year      = {2026},
    address   = {San Diego, California, United States},
    publisher = {Association for Computational Linguistics},
    url       = {https://aclanthology.org/2026.acl-srw.99/},
    pages     = {1139--1151},
    isbn      = {979-8-89176-393-7},
    abstract  = {The development of fact-checking systems for verifying the factuality of text generated by large language models (LLMs) has been advancing. In the verdict prediction step of such systems, the system determines whether claims in the generated text are supported by retrieved evidence, formulated as a natural language inference (NLI) task. This study extends the label set for verdict prediction to capture claim-evidence relationships that humans would commonly interpret as supported or refuted, even in the absence of strict logical entailment or contradiction. It also constructs a Japanese dataset comprising 28,147 instances from two sources based on this extended label set. We analyze the causes of annotation disagreement and find that ambiguity in the boundary of acceptable inference, interpretive characteristics of negative cases, and incomplete information in the evidence affect annotation variability. Using this dataset, we evaluate the performance of prompt-based verdict prediction methods and show that prompts that explicitly elicit chain-of-thought reasoning improve F1 by 4 percentage points compared to baseline.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="masano-etal-2026-constructing">
<titleInfo>
<title>Constructing a Japanese Verdict Prediction Dataset for Fact-Checking of LLM-Generated Texts</title>
</titleInfo>
<name type="personal">
<namePart type="given">Miwa</namePart>
<namePart type="family">Masano</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hirokazu</namePart>
<namePart type="family">Kiyomaru</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Atsushi</namePart>
<namePart type="family">Keyaki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kaito</namePart>
<namePart type="family">Horio</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rei</namePart>
<namePart type="family">Minamoto</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ribeka</namePart>
<namePart type="family">Keyaki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kouta</namePart>
<namePart type="family">Nakayama</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hideyuki</namePart>
<namePart type="family">Tachibana</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Daisuke</namePart>
<namePart type="family">Kawahara</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (ACL 2026)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Santosh</namePart>
<namePart type="family">T.Y.S.S.</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Juan</namePart>
<namePart type="given">Diego</namePart>
<namePart type="family">Rodriguez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ona</namePart>
<namePart type="family">de Gibert</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-393-7</identifier>
</relatedItem>
<abstract>The development of fact-checking systems for verifying the factuality of text generated by large language models (LLMs) has been advancing. In the verdict prediction step of such systems, the system determines whether claims in the generated text are supported by retrieved evidence, formulated as a natural language inference (NLI) task. This study extends the label set for verdict prediction to capture claim-evidence relationships that humans would commonly interpret as supported or refuted, even in the absence of strict logical entailment or contradiction. It also constructs a Japanese dataset comprising 28,147 instances from two sources based on this extended label set. We analyze the causes of annotation disagreement and find that ambiguity in the boundary of acceptable inference, interpretive characteristics of negative cases, and incomplete information in the evidence affect annotation variability. Using this dataset, we evaluate the performance of prompt-based verdict prediction methods and show that prompts that explicitly elicit chain-of-thought reasoning improve F1 by 4 percentage points compared to baseline.</abstract>
<identifier type="citekey">masano-etal-2026-constructing</identifier>
<location>
<url>https://aclanthology.org/2026.acl-srw.99/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>1139</start>
<end>1151</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Constructing a Japanese Verdict Prediction Dataset for Fact-Checking of LLM-Generated Texts
%A Masano, Miwa
%A Kiyomaru, Hirokazu
%A Keyaki, Atsushi
%A Horio, Kaito
%A Minamoto, Rei
%A Keyaki, Ribeka
%A Nakayama, Kouta
%A Tachibana, Hideyuki
%A Kawahara, Daisuke
%Y T.Y.S.S., Santosh
%Y Rodriguez, Juan Diego
%Y de Gibert, Ona
%S Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (ACL 2026)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-393-7
%F masano-etal-2026-constructing
%X The development of fact-checking systems for verifying the factuality of text generated by large language models (LLMs) has been advancing. In the verdict prediction step of such systems, the system determines whether claims in the generated text are supported by retrieved evidence, formulated as a natural language inference (NLI) task. This study extends the label set for verdict prediction to capture claim-evidence relationships that humans would commonly interpret as supported or refuted, even in the absence of strict logical entailment or contradiction. It also constructs a Japanese dataset comprising 28,147 instances from two sources based on this extended label set. We analyze the causes of annotation disagreement and find that ambiguity in the boundary of acceptable inference, interpretive characteristics of negative cases, and incomplete information in the evidence affect annotation variability. Using this dataset, we evaluate the performance of prompt-based verdict prediction methods and show that prompts that explicitly elicit chain-of-thought reasoning improve F1 by 4 percentage points compared to baseline.
%U https://aclanthology.org/2026.acl-srw.99/
%P 1139-1151
Markdown (Informal)
[Constructing a Japanese Verdict Prediction Dataset for Fact-Checking of LLM-Generated Texts](https://aclanthology.org/2026.acl-srw.99/) (Masano et al., ACL 2026)
ACL
- Miwa Masano, Hirokazu Kiyomaru, Atsushi Keyaki, Kaito Horio, Rei Minamoto, Ribeka Keyaki, Kouta Nakayama, Hideyuki Tachibana, and Daisuke Kawahara. 2026. Constructing a Japanese Verdict Prediction Dataset for Fact-Checking of LLM-Generated Texts. In Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (ACL 2026), pages 1139–1151, San Diego, California, United States. Association for Computational Linguistics.