% CANarEx paper, Findings of EMNLP 2022 (ACL Anthology record).
% Case-sensitive tokens (CANarEx, EMNLP) are brace-protected as whole words so
% that sentence-casing bibliography styles leave them untouched.
@inproceedings{anantharama-etal-2022-canarex,
  title     = {{CANarEx}: Contextually Aware Narrative Extraction for Semantically Rich Text-as-data Applications},
  author    = {Anantharama, Nandini and
               Angus, Simon and
               O{'}Neill, Lachlan},
  editor    = {Goldberg, Yoav and
               Kozareva, Zornitsa and
               Zhang, Yue},
  booktitle = {Findings of the Association for Computational Linguistics: {EMNLP} 2022},
  month     = dec,
  year      = {2022},
  address   = {Abu Dhabi, United Arab Emirates},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2022.findings-emnlp.260},
  doi       = {10.18653/v1/2022.findings-emnlp.260},
  pages     = {3551--3564},
  abstract  = {Narrative modelling is an area of active research, motivated by the acknowledgement of narratives as drivers of societal decision making. These research efforts conceptualize narratives as connected entity chains, and modeling typically focuses on the identification of entities and their connections within a text. An emerging approach to narrative modelling is the use of semantic role labeling (SRL) to extract Entity-Verb-Entity (E-V-Es) tuples from a text, followed by dimensionality reduction to reduce the space of entities and connections separately. This process penalises the semantic richness of narratives and discards much contextual information along the way. Here, we propose an alternate narrative extraction approach - CANarEx, incorporating a pipeline of common contextual constructs through co-reference resolution, micro-narrative generation and clustering of these narratives through sentence embeddings. We evaluate our approach through testing the recovery of {``}narrative time-series clusters{''}, mimicking a desirable text-as-data task. The evaluation framework leverages synthetic data generated using a GPT-3 model. The GPT-3 model is trained to generate similar sentences using a large dataset of news articles. The synthetic data maps to three topics in the news dataset. We then generate narrative time-series document cluster representations by mapping the synthetic data to three distinct signals synthetically injected into the testing corpus. Evaluation results demonstrate the superior ability of CANarEx to recover narrative time-series through reduced MSE and improved precision/recall relative to existing methods. The validity is further reinforced through ablation studies and qualitative analysis.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 record for citekey anantharama-etal-2022-canarex. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="anantharama-etal-2022-canarex">
    <!-- Paper title -->
    <titleInfo>
      <title>CANarEx: Contextually Aware Narrative Extraction for Semantically Rich Text-as-data Applications</title>
    </titleInfo>
    <!-- Authors, in byline order -->
    <name type="personal">
      <namePart type="given">Nandini</namePart>
      <namePart type="family">Anantharama</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Simon</namePart>
      <namePart type="family">Angus</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Lachlan</namePart>
      <namePart type="family">O’Neill</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2022-12</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host volume: proceedings title, editors, publisher and place -->
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: EMNLP 2022</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Yoav</namePart>
        <namePart type="family">Goldberg</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Zornitsa</namePart>
        <namePart type="family">Kozareva</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Yue</namePart>
        <namePart type="family">Zhang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Abu Dhabi, United Arab Emirates</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Narrative modelling is an area of active research, motivated by the acknowledgement of narratives as drivers of societal decision making. These research efforts conceptualize narratives as connected entity chains, and modeling typically focuses on the identification of entities and their connections within a text. An emerging approach to narrative modelling is the use of semantic role labeling (SRL) to extract Entity-Verb-Entity (E-V-Es) tuples from a text, followed by dimensionality reduction to reduce the space of entities and connections separately. This process penalises the semantic richness of narratives and discards much contextual information along the way. Here, we propose an alternate narrative extraction approach - CANarEx, incorporating a pipeline of common contextual constructs through co-reference resolution, micro-narrative generation and clustering of these narratives through sentence embeddings. We evaluate our approach through testing the recovery of “narrative time-series clusters”, mimicking a desirable text-as-data task. The evaluation framework leverages synthetic data generated using a GPT-3 model. The GPT-3 model is trained to generate similar sentences using a large dataset of news articles. The synthetic data maps to three topics in the news dataset. We then generate narrative time-series document cluster representations by mapping the synthetic data to three distinct signals synthetically injected into the testing corpus. Evaluation results demonstrate the superior ability of CANarEx to recover narrative time-series through reduced MSE and improved precision/recall relative to existing methods. The validity is further reinforced through ablation studies and qualitative analysis.</abstract>
    <!-- Identifiers and landing page -->
    <identifier type="citekey">anantharama-etal-2022-canarex</identifier>
    <identifier type="doi">10.18653/v1/2022.findings-emnlp.260</identifier>
    <location>
      <url>https://aclanthology.org/2022.findings-emnlp.260</url>
    </location>
    <!-- Issue date and page extent within the host volume -->
    <part>
      <date>2022-12</date>
      <extent unit="page">
        <start>3551</start>
        <end>3564</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T CANarEx: Contextually Aware Narrative Extraction for Semantically Rich Text-as-data Applications
%A Anantharama, Nandini
%A Angus, Simon
%A O’Neill, Lachlan
%Y Goldberg, Yoav
%Y Kozareva, Zornitsa
%Y Zhang, Yue
%S Findings of the Association for Computational Linguistics: EMNLP 2022
%D 2022
%8 December
%I Association for Computational Linguistics
%C Abu Dhabi, United Arab Emirates
%F anantharama-etal-2022-canarex
%X Narrative modelling is an area of active research, motivated by the acknowledgement of narratives as drivers of societal decision making. These research efforts conceptualize narratives as connected entity chains, and modeling typically focuses on the identification of entities and their connections within a text. An emerging approach to narrative modelling is the use of semantic role labeling (SRL) to extract Entity-Verb-Entity (E-V-Es) tuples from a text, followed by dimensionality reduction to reduce the space of entities and connections separately. This process penalises the semantic richness of narratives and discards much contextual information along the way. Here, we propose an alternate narrative extraction approach - CANarEx, incorporating a pipeline of common contextual constructs through co-reference resolution, micro-narrative generation and clustering of these narratives through sentence embeddings. We evaluate our approach through testing the recovery of “narrative time-series clusters”, mimicking a desirable text-as-data task. The evaluation framework leverages synthetic data generated using a GPT-3 model. The GPT-3 model is trained to generate similar sentences using a large dataset of news articles. The synthetic data maps to three topics in the news dataset. We then generate narrative time-series document cluster representations by mapping the synthetic data to three distinct signals synthetically injected into the testing corpus. Evaluation results demonstrate the superior ability of CANarEx to recover narrative time-series through reduced MSE and improved precision/recall relative to existing methods. The validity is further reinforced through ablation studies and qualitative analysis.
%R 10.18653/v1/2022.findings-emnlp.260
%U https://aclanthology.org/2022.findings-emnlp.260
%U https://doi.org/10.18653/v1/2022.findings-emnlp.260
%P 3551-3564
Markdown (Informal)
[CANarEx: Contextually Aware Narrative Extraction for Semantically Rich Text-as-data Applications](https://aclanthology.org/2022.findings-emnlp.260) (Anantharama et al., Findings 2022)
ACL