@inproceedings{li-etal-2024-newsbench,
title = "{N}ews{B}ench: A Systematic Evaluation Framework for Assessing Editorial Capabilities of Large Language Models in {C}hinese Journalism",
author = "Li, Miao and
Chen, Ming-Bin and
Tang, Bo and
ShengbinHou, ShengbinHou and
Wang, Pengyu and
Deng, Haiying and
Li, Zhiyu and
Xiong, Feiyu and
Mao, Keming and
Peng, Cheng and
Luo, Yi",
editor = "Ku, Lun-Wei and
Martins, Andre and
Srikumar, Vivek",
booktitle = "Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = aug,
year = "2024",
address = "Bangkok, Thailand",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.luhme-long.538/",
doi = "10.18653/v1/2024.acl-long.538",
pages = "9993--10014",
abstract = "We present NewsBench, a novel evaluation framework to systematically assess the capabilities of Large Language Models (LLMs) for editorial capabilities in Chinese journalism. Our constructed benchmark dataset is focused on four facets of writing proficiency and six facets of safety adherence, and it comprises manually and carefully designed 1,267 test samples in the types of multiple choice questions and short answer questions for five editorial tasks in 24 news domains. To measure performances, we propose different GPT-4 based automatic evaluation protocols to assess LLM generations for short answer questions in terms of writing proficiency and safety adherence, and both are validated by the high correlations with human evaluations. Based on the systematic evaluation framework, we conduct a comprehensive analysis of eleven popular LLMs which can handle Chinese. The experimental results highlight GPT-4 and ERNIE Bot as top performers, yet reveal a relative deficiency in journalistic safety adherence in creative writing tasks. Our findings also underscore the need for enhanced ethical guidance in machine-generated journalistic content, marking a step forward in aligning LLMs with journalistic standards and safety considerations. The evaluation framework and experimental results are expected to provide an in-depth understanding of the editorial capabilities of LLMs and speed up the development of LLMs in journalism."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="li-etal-2024-newsbench">
<titleInfo>
<title>NewsBench: A Systematic Evaluation Framework for Assessing Editorial Capabilities of Large Language Models in Chinese Journalism</title>
</titleInfo>
<name type="personal">
<namePart type="given">Miao</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ming-Bin</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bo</namePart>
<namePart type="family">Tang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">ShengbinHou</namePart>
<namePart type="family">ShengbinHou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pengyu</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Haiying</namePart>
<namePart type="family">Deng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhiyu</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Feiyu</namePart>
<namePart type="family">Xiong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Keming</namePart>
<namePart type="family">Mao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Cheng</namePart>
<namePart type="family">Peng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yi</namePart>
<namePart type="family">Luo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Lun-Wei</namePart>
<namePart type="family">Ku</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Andre</namePart>
<namePart type="family">Martins</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vivek</namePart>
<namePart type="family">Srikumar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Bangkok, Thailand</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We present NewsBench, a novel evaluation framework to systematically assess the capabilities of Large Language Models (LLMs) for editorial capabilities in Chinese journalism. Our constructed benchmark dataset is focused on four facets of writing proficiency and six facets of safety adherence, and it comprises manually and carefully designed 1,267 test samples in the types of multiple choice questions and short answer questions for five editorial tasks in 24 news domains. To measure performances, we propose different GPT-4 based automatic evaluation protocols to assess LLM generations for short answer questions in terms of writing proficiency and safety adherence, and both are validated by the high correlations with human evaluations. Based on the systematic evaluation framework, we conduct a comprehensive analysis of eleven popular LLMs which can handle Chinese. The experimental results highlight GPT-4 and ERNIE Bot as top performers, yet reveal a relative deficiency in journalistic safety adherence in creative writing tasks. Our findings also underscore the need for enhanced ethical guidance in machine-generated journalistic content, marking a step forward in aligning LLMs with journalistic standards and safety considerations. The evaluation framework and experimental results are expected to provide an in-depth understanding of the editorial capabilities of LLMs and speed up the development of LLMs in journalism.</abstract>
<identifier type="citekey">li-etal-2024-newsbench</identifier>
<identifier type="doi">10.18653/v1/2024.acl-long.538</identifier>
<location>
<url>https://aclanthology.org/2024.luhme-long.538/</url>
</location>
<part>
<date>2024-08</date>
<extent unit="page">
<start>9993</start>
<end>10014</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T NewsBench: A Systematic Evaluation Framework for Assessing Editorial Capabilities of Large Language Models in Chinese Journalism
%A Li, Miao
%A Chen, Ming-Bin
%A Tang, Bo
%A ShengbinHou, ShengbinHou
%A Wang, Pengyu
%A Deng, Haiying
%A Li, Zhiyu
%A Xiong, Feiyu
%A Mao, Keming
%A Peng, Cheng
%A Luo, Yi
%Y Ku, Lun-Wei
%Y Martins, Andre
%Y Srikumar, Vivek
%S Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2024
%8 August
%I Association for Computational Linguistics
%C Bangkok, Thailand
%F li-etal-2024-newsbench
%X We present NewsBench, a novel evaluation framework to systematically assess the capabilities of Large Language Models (LLMs) for editorial capabilities in Chinese journalism. Our constructed benchmark dataset is focused on four facets of writing proficiency and six facets of safety adherence, and it comprises manually and carefully designed 1,267 test samples in the types of multiple choice questions and short answer questions for five editorial tasks in 24 news domains. To measure performances, we propose different GPT-4 based automatic evaluation protocols to assess LLM generations for short answer questions in terms of writing proficiency and safety adherence, and both are validated by the high correlations with human evaluations. Based on the systematic evaluation framework, we conduct a comprehensive analysis of eleven popular LLMs which can handle Chinese. The experimental results highlight GPT-4 and ERNIE Bot as top performers, yet reveal a relative deficiency in journalistic safety adherence in creative writing tasks. Our findings also underscore the need for enhanced ethical guidance in machine-generated journalistic content, marking a step forward in aligning LLMs with journalistic standards and safety considerations. The evaluation framework and experimental results are expected to provide an in-depth understanding of the editorial capabilities of LLMs and speed up the development of LLMs in journalism.
%R 10.18653/v1/2024.acl-long.538
%U https://aclanthology.org/2024.luhme-long.538/
%U https://doi.org/10.18653/v1/2024.acl-long.538
%P 9993-10014
Markdown (Informal)
[NewsBench: A Systematic Evaluation Framework for Assessing Editorial Capabilities of Large Language Models in Chinese Journalism](https://aclanthology.org/2024.luhme-long.538/) (Li et al., ACL 2024)
ACL
- Miao Li, Ming-Bin Chen, Bo Tang, ShengbinHou ShengbinHou, Pengyu Wang, Haiying Deng, Zhiyu Li, Feiyu Xiong, Keming Mao, Cheng Peng, and Yi Luo. 2024. NewsBench: A Systematic Evaluation Framework for Assessing Editorial Capabilities of Large Language Models in Chinese Journalism. In Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 9993–10014, Bangkok, Thailand. Association for Computational Linguistics.