@inproceedings{dwivedi-yu-etal-2024-editeval,
title = "{E}dit{E}val: An Instruction-Based Benchmark for Text Improvements",
author = "Dwivedi-Yu, Jane and
Schick, Timo and
Jiang, Zhengbao and
Lomeli, Maria and
Lewis, Patrick and
Izacard, Gautier and
Grave, Edouard and
Riedel, Sebastian and
Petroni, Fabio",
editor = "Barak, Libby and
Alikhani, Malihe",
booktitle = "Proceedings of the 28th Conference on Computational Natural Language Learning",
month = nov,
year = "2024",
address = "Miami, FL, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.conll-1.7",
pages = "69--83",
abstract = "Evaluation of text generation to date has primarily focused on content created sequentially, rather than improvements on a piece of text. Writing, however, is naturally an iterative and incremental process that requires expertise in different modular skills such as fixing outdated information or making the writing style more consistent. Even so, comprehensive evaluation of a model{'}s capacity to perform these skills and the ability to edit remains sparse. This work introduces EditEval: An instruction-based, benchmark and evaluation suite that leverages high-quality existing and new datasets in English for the automatic evaluation of editing capabilities, such as making text more cohesive and paraphrasing. We evaluate several pre-trained models, which shows that InstructGPT and PEER on average perform the best, but that most baselines fall below the supervised state-of-the-art, particularly when neutralizing and updating information. Our analysis also shows that commonly used metrics for editing tasks do not always correlate well, and that prompts leading to the strongest performance do not necessarily elicit strong performance across different models. Through the release of this benchmark (code and data available at https://github.com/facebookresearch/EditEval) and a publicly available leaderboard challenge, we hope to unlock future work on developing models more capable of controllable and iterative editing.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="dwivedi-yu-etal-2024-editeval">
<titleInfo>
<title>EditEval: An Instruction-Based Benchmark for Text Improvements</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jane</namePart>
<namePart type="family">Dwivedi-Yu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Timo</namePart>
<namePart type="family">Schick</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhengbao</namePart>
<namePart type="family">Jiang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Lomeli</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Patrick</namePart>
<namePart type="family">Lewis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gautier</namePart>
<namePart type="family">Izacard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Edouard</namePart>
<namePart type="family">Grave</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sebastian</namePart>
<namePart type="family">Riedel</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Fabio</namePart>
<namePart type="family">Petroni</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 28th Conference on Computational Natural Language Learning</title>
</titleInfo>
<name type="personal">
<namePart type="given">Libby</namePart>
<namePart type="family">Barak</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Malihe</namePart>
<namePart type="family">Alikhani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, FL, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Evaluation of text generation to date has primarily focused on content created sequentially, rather than improvements on a piece of text. Writing, however, is naturally an iterative and incremental process that requires expertise in different modular skills such as fixing outdated information or making the writing style more consistent. Even so, comprehensive evaluation of a model’s capacity to perform these skills and the ability to edit remains sparse. This work introduces EditEval: An instruction-based, benchmark and evaluation suite that leverages high-quality existing and new datasets in English for the automatic evaluation of editing capabilities, such as making text more cohesive and paraphrasing. We evaluate several pre-trained models, which shows that InstructGPT and PEER on average perform the best, but that most baselines fall below the supervised state-of-the-art, particularly when neutralizing and updating information. Our analysis also shows that commonly used metrics for editing tasks do not always correlate well, and that prompts leading to the strongest performance do not necessarily elicit strong performance across different models. Through the release of this benchmark (code and data available at https://github.com/facebookresearch/EditEval) and a publicly available leaderboard challenge, we hope to unlock future work on developing models more capable of controllable and iterative editing.</abstract>
<identifier type="citekey">dwivedi-yu-etal-2024-editeval</identifier>
<location>
<url>https://aclanthology.org/2024.conll-1.7</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>69</start>
<end>83</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T EditEval: An Instruction-Based Benchmark for Text Improvements
%A Dwivedi-Yu, Jane
%A Schick, Timo
%A Jiang, Zhengbao
%A Lomeli, Maria
%A Lewis, Patrick
%A Izacard, Gautier
%A Grave, Edouard
%A Riedel, Sebastian
%A Petroni, Fabio
%Y Barak, Libby
%Y Alikhani, Malihe
%S Proceedings of the 28th Conference on Computational Natural Language Learning
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, FL, USA
%F dwivedi-yu-etal-2024-editeval
%X Evaluation of text generation to date has primarily focused on content created sequentially, rather than improvements on a piece of text. Writing, however, is naturally an iterative and incremental process that requires expertise in different modular skills such as fixing outdated information or making the writing style more consistent. Even so, comprehensive evaluation of a model’s capacity to perform these skills and the ability to edit remains sparse. This work introduces EditEval: An instruction-based, benchmark and evaluation suite that leverages high-quality existing and new datasets in English for the automatic evaluation of editing capabilities, such as making text more cohesive and paraphrasing. We evaluate several pre-trained models, which shows that InstructGPT and PEER on average perform the best, but that most baselines fall below the supervised state-of-the-art, particularly when neutralizing and updating information. Our analysis also shows that commonly used metrics for editing tasks do not always correlate well, and that prompts leading to the strongest performance do not necessarily elicit strong performance across different models. Through the release of this benchmark (code and data available at https://github.com/facebookresearch/EditEval) and a publicly available leaderboard challenge, we hope to unlock future work on developing models more capable of controllable and iterative editing.
%U https://aclanthology.org/2024.conll-1.7
%P 69-83
Markdown (Informal)
[EditEval: An Instruction-Based Benchmark for Text Improvements](https://aclanthology.org/2024.conll-1.7) (Dwivedi-Yu et al., CoNLL 2024)
ACL
- Jane Dwivedi-Yu, Timo Schick, Zhengbao Jiang, Maria Lomeli, Patrick Lewis, Gautier Izacard, Edouard Grave, Sebastian Riedel, and Fabio Petroni. 2024. EditEval: An Instruction-Based Benchmark for Text Improvements. In Proceedings of the 28th Conference on Computational Natural Language Learning, pages 69–83, Miami, FL, USA. Association for Computational Linguistics.