@inproceedings{spangher-etal-2024-explaining,
title = "Explaining Mixtures of Sources in News Articles",
author = "Spangher, Alexander and
Youn, James and
DeButts, Matt and
Peng, Nanyun and
Ferrara, Emilio and
May, Jonathan",
editor = "Al-Onaizan, Yaser and
Bansal, Mohit and
Chen, Yun-Nung",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2024",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.findings-emnlp.930",
doi = "10.18653/v1/2024.findings-emnlp.930",
pages = "15837--15859",
abstract = "Human writers plan, {\_}then{\_} write. For large language models (LLMs) to play a role in longer-form article generation, we must understand the planning steps humans make before writing. We explore one kind of planning, source-selection in news, as a case-study for evaluating plans in long-form generation. We ask: why do {\_}specific{\_} stories call for {\_}specific{\_} kinds of sources? We imagine a generative process for story writing where a source-selection schema is first selected by a journalist, and then sources are chosen based on categories in that schema. Learning the article{'}s {\_}plan{\_} means predicting the schema initially chosen by the journalist. Working with professional journalists, we adapt five existing schemata and introduce three new ones to describe journalistic plans for the inclusion of sources in documents. Then, inspired by Bayesian latent-variable modeling, we develop metrics to select the most likely plan, or schema, underlying a story, which we use to compare schemata. We find that two schemata: {\_}stance{\_} and {\_}social affiliation{\_} best explain source plans in most documents. However, other schemata like {\_}textual entailment{\_} explain source plans in factually rich topics like {``}Science{''}. Finally, we find we can predict the most suitable schema given just the article{'}s headline with reasonable accuracy. We see this as an important case-study for human planning, and provides a framework and approach for evaluating other kinds of plans, like discourse or plot-oriented plans. We release a corpora, {\_}NewsSources{\_}, with annotations for 4M articles, for further study.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="spangher-etal-2024-explaining">
    <titleInfo>
      <title>Explaining Mixtures of Sources in News Articles</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Alexander</namePart>
      <namePart type="family">Spangher</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">James</namePart>
      <namePart type="family">Youn</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Matt</namePart>
      <namePart type="family">DeButts</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Nanyun</namePart>
      <namePart type="family">Peng</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Emilio</namePart>
      <namePart type="family">Ferrara</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Jonathan</namePart>
      <namePart type="family">May</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2024-11</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: EMNLP 2024</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Yaser</namePart>
        <namePart type="family">Al-Onaizan</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Mohit</namePart>
        <namePart type="family">Bansal</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Yun-Nung</namePart>
        <namePart type="family">Chen</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Miami, Florida, USA</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Human writers plan, _then_ write. For large language models (LLMs) to play a role in longer-form article generation, we must understand the planning steps humans make before writing. We explore one kind of planning, source-selection in news, as a case-study for evaluating plans in long-form generation. We ask: why do _specific_ stories call for _specific_ kinds of sources? We imagine a generative process for story writing where a source-selection schema is first selected by a journalist, and then sources are chosen based on categories in that schema. Learning the article’s _plan_ means predicting the schema initially chosen by the journalist. Working with professional journalists, we adapt five existing schemata and introduce three new ones to describe journalistic plans for the inclusion of sources in documents. Then, inspired by Bayesian latent-variable modeling, we develop metrics to select the most likely plan, or schema, underlying a story, which we use to compare schemata. We find that two schemata, _stance_ and _social affiliation_, best explain source plans in most documents. However, other schemata like _textual entailment_ explain source plans in factually rich topics like “Science”. Finally, we find we can predict the most suitable schema given just the article’s headline with reasonable accuracy. We see this as an important case-study for human planning, and it provides a framework and approach for evaluating other kinds of plans, like discourse or plot-oriented plans. We release a corpus, _NewsSources_, with annotations for 4M articles, for further study.</abstract>
<identifier type="citekey">spangher-etal-2024-explaining</identifier>
<identifier type="doi">10.18653/v1/2024.findings-emnlp.930</identifier>
<location>
<url>https://aclanthology.org/2024.findings-emnlp.930</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>15837</start>
<end>15859</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Explaining Mixtures of Sources in News Articles
%A Spangher, Alexander
%A Youn, James
%A DeButts, Matt
%A Peng, Nanyun
%A Ferrara, Emilio
%A May, Jonathan
%Y Al-Onaizan, Yaser
%Y Bansal, Mohit
%Y Chen, Yun-Nung
%S Findings of the Association for Computational Linguistics: EMNLP 2024
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F spangher-etal-2024-explaining
%X Human writers plan, _then_ write. For large language models (LLMs) to play a role in longer-form article generation, we must understand the planning steps humans make before writing. We explore one kind of planning, source-selection in news, as a case-study for evaluating plans in long-form generation. We ask: why do _specific_ stories call for _specific_ kinds of sources? We imagine a generative process for story writing where a source-selection schema is first selected by a journalist, and then sources are chosen based on categories in that schema. Learning the article’s _plan_ means predicting the schema initially chosen by the journalist. Working with professional journalists, we adapt five existing schemata and introduce three new ones to describe journalistic plans for the inclusion of sources in documents. Then, inspired by Bayesian latent-variable modeling, we develop metrics to select the most likely plan, or schema, underlying a story, which we use to compare schemata. We find that two schemata, _stance_ and _social affiliation_, best explain source plans in most documents. However, other schemata like _textual entailment_ explain source plans in factually rich topics like “Science”. Finally, we find we can predict the most suitable schema given just the article’s headline with reasonable accuracy. We see this as an important case-study for human planning, and it provides a framework and approach for evaluating other kinds of plans, like discourse or plot-oriented plans. We release a corpus, _NewsSources_, with annotations for 4M articles, for further study.
%R 10.18653/v1/2024.findings-emnlp.930
%U https://aclanthology.org/2024.findings-emnlp.930
%U https://doi.org/10.18653/v1/2024.findings-emnlp.930
%P 15837-15859
Markdown (Informal)
[Explaining Mixtures of Sources in News Articles](https://aclanthology.org/2024.findings-emnlp.930) (Spangher et al., Findings 2024)
ACL
- Alexander Spangher, James Youn, Matt DeButts, Nanyun Peng, Emilio Ferrara, and Jonathan May. 2024. Explaining Mixtures of Sources in News Articles. In Findings of the Association for Computational Linguistics: EMNLP 2024, pages 15837–15859, Miami, Florida, USA. Association for Computational Linguistics.
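The abstract describes selecting the schema most likely to underlie a story, in the spirit of Bayesian latent-variable model comparison. A minimal sketch of that selection step, assuming toy schema names, source categories, and priors invented purely for illustration (none are taken from the paper or the NewsSources corpus):

# Hypothetical sketch (toy data): choose the schema whose categories best explain
# an article's observed sources, via log-likelihood under simple categorical priors.
import math

# Each candidate schema labels every source in the article with one of its categories
# (labels and schemata below are invented for illustration).
article_sources_by_schema = {
    "stance":             ["pro", "anti", "neutral", "pro"],
    "social_affiliation": ["government", "academic", "civilian", "academic"],
}

# Assumed per-schema category priors; in practice these would be estimated from a corpus.
category_priors = {
    "stance":             {"pro": 0.4, "anti": 0.4, "neutral": 0.2},
    "social_affiliation": {"government": 0.3, "academic": 0.3, "civilian": 0.4},
}

def log_likelihood(schema: str) -> float:
    """Log-probability of the article's source labels under the schema's categorical model."""
    priors = category_priors[schema]
    return sum(math.log(priors[label]) for label in article_sources_by_schema[schema])

# The inferred "plan" is the schema with the highest likelihood for this article.
best_schema = max(category_priors, key=log_likelihood)
print(best_schema, {s: round(log_likelihood(s), 3) for s in category_priors})

The argmax over candidate schemata here corresponds to the "most likely plan" the abstract refers to; the paper's actual metrics are richer than this toy comparison.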