@inproceedings{gobara-etal-2025-speaker,
title = "Speaker Identification and Dataset Construction Using {LLM}s: A Case Study on {J}apanese Narratives",
author = "Gobara, Seiji and
Kamigaito, Hidetaka and
Watanabe, Taro",
editor = "Clark, Elizabeth and
Lal, Yash Kumar and
Chaturvedi, Snigdha and
Iyyer, Mohit and
Brei, Anneliese and
Modi, Ashutosh and
Chandu, Khyathi Raghavi",
booktitle = "Proceedings of the 7th Workshop on Narrative Understanding",
month = may,
year = "2025",
address = "Albuquerque, New Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.wnu-1.17/",
doi = "10.18653/v1/2025.wnu-1.17",
pages = "97--119",
ISBN = "979-8-89176-247-3",
abstract = "Speaker identification in narrative analysis is a challenging task due to complex dialogues, diverse utterance patterns, and ambiguous character references. Costly and time-intensive manual annotation limits the scalability of high-quality dataset creation. This study demonstrates a cost-efficient approach of constructing speaker identification datasets by combining small-scale manual annotation with LLM-based labeling. A subset of data is manually annotated and is used to guide LLM predictions with a few-shot approach followed by refinement through minimal human corrections. Our results show that LLMs achieve approximately 90{\%} accuracy on challenging narratives, such as the ``Three Kingdoms'' dataset, underscoring the importance of targeted human corrections. This approach proves effective for constructing scalable and cost-efficient datasets for Japanese and complex narratives."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="gobara-etal-2025-speaker">
<titleInfo>
<title>Speaker Identification and Dataset Construction Using LLMs: A Case Study on Japanese Narratives</title>
</titleInfo>
<name type="personal">
<namePart type="given">Seiji</namePart>
<namePart type="family">Gobara</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hidetaka</namePart>
<namePart type="family">Kamigaito</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Taro</namePart>
<namePart type="family">Watanabe</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 7th Workshop on Narrative Understanding</title>
</titleInfo>
<name type="personal">
<namePart type="given">Elizabeth</namePart>
<namePart type="family">Clark</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yash</namePart>
<namePart type="given">Kumar</namePart>
<namePart type="family">Lal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Snigdha</namePart>
<namePart type="family">Chaturvedi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohit</namePart>
<namePart type="family">Iyyer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anneliese</namePart>
<namePart type="family">Brei</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ashutosh</namePart>
<namePart type="family">Modi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Khyathi</namePart>
<namePart type="given">Raghavi</namePart>
<namePart type="family">Chandu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Albuquerque, New Mexico</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-247-3</identifier>
</relatedItem>
<abstract>Speaker identification in narrative analysis is a challenging task due to complex dialogues, diverse utterance patterns, and ambiguous character references. Costly and time-intensive manual annotation limits the scalability of high-quality dataset creation. This study demonstrates a cost-efficient approach of constructing speaker identification datasets by combining small-scale manual annotation with LLM-based labeling. A subset of data is manually annotated and is used to guide LLM predictions with a few-shot approach followed by refinement through minimal human corrections. Our results show that LLMs achieve approximately 90% accuracy on challenging narratives, such as the “Three Kingdoms” dataset, underscoring the importance of targeted human corrections. This approach proves effective for constructing scalable and cost-efficient datasets for Japanese and complex narratives.</abstract>
<identifier type="citekey">gobara-etal-2025-speaker</identifier>
<identifier type="doi">10.18653/v1/2025.wnu-1.17</identifier>
<location>
<url>https://aclanthology.org/2025.wnu-1.17/</url>
</location>
<part>
<date>2025-05</date>
<extent unit="page">
<start>97</start>
<end>119</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Speaker Identification and Dataset Construction Using LLMs: A Case Study on Japanese Narratives
%A Gobara, Seiji
%A Kamigaito, Hidetaka
%A Watanabe, Taro
%Y Clark, Elizabeth
%Y Lal, Yash Kumar
%Y Chaturvedi, Snigdha
%Y Iyyer, Mohit
%Y Brei, Anneliese
%Y Modi, Ashutosh
%Y Chandu, Khyathi Raghavi
%S Proceedings of the 7th Workshop on Narrative Understanding
%D 2025
%8 May
%I Association for Computational Linguistics
%C Albuquerque, New Mexico
%@ 979-8-89176-247-3
%F gobara-etal-2025-speaker
%X Speaker identification in narrative analysis is a challenging task due to complex dialogues, diverse utterance patterns, and ambiguous character references. Costly and time-intensive manual annotation limits the scalability of high-quality dataset creation. This study demonstrates a cost-efficient approach of constructing speaker identification datasets by combining small-scale manual annotation with LLM-based labeling. A subset of data is manually annotated and is used to guide LLM predictions with a few-shot approach followed by refinement through minimal human corrections. Our results show that LLMs achieve approximately 90% accuracy on challenging narratives, such as the “Three Kingdoms” dataset, underscoring the importance of targeted human corrections. This approach proves effective for constructing scalable and cost-efficient datasets for Japanese and complex narratives.
%R 10.18653/v1/2025.wnu-1.17
%U https://aclanthology.org/2025.wnu-1.17/
%U https://doi.org/10.18653/v1/2025.wnu-1.17
%P 97-119
Markdown (Informal)
[Speaker Identification and Dataset Construction Using LLMs: A Case Study on Japanese Narratives](https://aclanthology.org/2025.wnu-1.17/) (Gobara et al., WNU 2025)
ACL