% Workshop paper from the WikiNLP 2024 proceedings (ACL Anthology record).
% NOTE(review): the first editor was exported with the hyphenated given name
% in the family-name slot and the surname missing; it is entered here in the
% same "Family, Given" form used for that person in the author field.
% Confirm against the proceedings front matter if this entry is re-exported.
% Proper nouns and acronyms in the title are braced so that sentence-casing
% bibliography styles do not lowercase them.
@inproceedings{johnson-etal-2024-wikimedia,
    title = "{Wikimedia} data for {AI}: a review of {Wikimedia} datasets for {NLP} tasks and {AI}-assisted editing",
    author = "Johnson, Isaac and
      Kaffee, Lucie-Aim{\'e}e and
      Redi, Miriam",
    editor = "Kaffee, Lucie-Aim{\'e}e and
      Fan, Angela and
      Gwadabe, Tajuddeen and
      Johnson, Isaac and
      Petroni, Fabio and
      van Strien, Daniel",
    booktitle = "Proceedings of the First Workshop on Advancing Natural Language Processing for Wikipedia",
    month = nov,
    year = "2024",
    address = "Miami, Florida, USA",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2024.wikinlp-1.14",
    pages = "91--101",
    abstract = "Wikimedia content is used extensively by the AI community and within the language modeling community in particular. In this paper, we provide a review of the different ways in which Wikimedia data is curated to use in NLP tasks across pre-training, post-training, and model evaluations. We point to opportunities for greater use of Wikimedia content but also identify ways in which the language modeling community could better center the needs of Wikimedia editors. In particular, we call for incorporating additional sources of Wikimedia data, a greater focus on benchmarks for LLMs that encode Wikimedia principles, and greater multilingualism in Wikimedia-derived datasets.",
}
<?xml version="1.0" encoding="UTF-8"?>
<!--
  MODS record for the citekey johnson-etal-2024-wikimedia: the paper itself
  (title, authors, abstract, pages) plus the host proceedings volume
  (editors, publisher, place) under relatedItem type="host".
  NOTE(review): the first editor's name parts were exported with the
  hyphenated given name in the family slot; they are entered here to match
  the same person's author record. Confirm against the proceedings front matter.
-->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="johnson-etal-2024-wikimedia">
    <titleInfo>
      <title>Wikimedia data for AI: a review of Wikimedia datasets for NLP tasks and AI-assisted editing</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Isaac</namePart>
      <namePart type="family">Johnson</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Lucie-Aimée</namePart>
      <namePart type="family">Kaffee</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Miriam</namePart>
      <namePart type="family">Redi</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2024-11</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume this paper appears in. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the First Workshop on Advancing Natural Language Processing for Wikipedia</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Lucie-Aimée</namePart>
        <namePart type="family">Kaffee</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Angela</namePart>
        <namePart type="family">Fan</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Tajuddeen</namePart>
        <namePart type="family">Gwadabe</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Isaac</namePart>
        <namePart type="family">Johnson</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Fabio</namePart>
        <namePart type="family">Petroni</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Daniel</namePart>
        <namePart type="family">van Strien</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Miami, Florida, USA</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Wikimedia content is used extensively by the AI community and within the language modeling community in particular. In this paper, we provide a review of the different ways in which Wikimedia data is curated to use in NLP tasks across pre-training, post-training, and model evaluations. We point to opportunities for greater use of Wikimedia content but also identify ways in which the language modeling community could better center the needs of Wikimedia editors. In particular, we call for incorporating additional sources of Wikimedia data, a greater focus on benchmarks for LLMs that encode Wikimedia principles, and greater multilingualism in Wikimedia-derived datasets.</abstract>
    <identifier type="citekey">johnson-etal-2024-wikimedia</identifier>
    <location>
      <url>https://aclanthology.org/2024.wikinlp-1.14</url>
    </location>
    <part>
      <date>2024-11</date>
      <extent unit="page">
        <start>91</start>
        <end>101</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Wikimedia data for AI: a review of Wikimedia datasets for NLP tasks and AI-assisted editing
%A Johnson, Isaac
%A Kaffee, Lucie-Aimée
%A Redi, Miriam
%Y Kaffee, Lucie-Aimée
%Y Fan, Angela
%Y Gwadabe, Tajuddeen
%Y Johnson, Isaac
%Y Petroni, Fabio
%Y van Strien, Daniel
%S Proceedings of the First Workshop on Advancing Natural Language Processing for Wikipedia
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F johnson-etal-2024-wikimedia
%X Wikimedia content is used extensively by the AI community and within the language modeling community in particular. In this paper, we provide a review of the different ways in which Wikimedia data is curated to use in NLP tasks across pre-training, post-training, and model evaluations. We point to opportunities for greater use of Wikimedia content but also identify ways in which the language modeling community could better center the needs of Wikimedia editors. In particular, we call for incorporating additional sources of Wikimedia data, a greater focus on benchmarks for LLMs that encode Wikimedia principles, and greater multilingualism in Wikimedia-derived datasets.
%U https://aclanthology.org/2024.wikinlp-1.14
%P 91-101
Markdown (Informal)
[Wikimedia data for AI: a review of Wikimedia datasets for NLP tasks and AI-assisted editing](https://aclanthology.org/2024.wikinlp-1.14) (Johnson et al., WikiNLP 2024)
ACL