@article{mcmillan-major-etal-2024-documenting,
title = "Documenting Geographically and Contextually Diverse Language Data Sources",
author = "McMillan-Major, Angelina and
De Toni, Francesco and
Alyafeai, Zaid and
Biderman, Stella and
Chen, Kimbo and
Dupont, G{\'e}rard and
Elsahar, Hady and
Emezue, Chris and
Aji, Alham Fikri and
Ili{\'c}, Suzana and
Khamis, Nurulaqilla and
Leong, Colin and
Masoud, Maraim and
Soroa, Aitor and
Ortiz Suarez, Pedro and
van Strien, Daniel and
Talat, Zeerak and
Jernite, Yacine",
editor = "Bollmann, Marcel",
journal = "Northern European Journal of Language Technology",
volume = "10",
month = dec,
year = "2024",
address = {Link{\"o}ping, Sweden},
publisher = {Link{\"o}ping University Electronic Press},
url = "https://aclanthology.org/2024.nejlt-1.4/",
doi = "10.3384/nejlt.2000-1533.2024.5217",
pages = "50--77",
abstract = "Contemporary large-scale data collection efforts have prioritized the amount of data collected to improve large language models (LLM). This quantitative approach has resulted in concerns for the rights of data subjects represented in data collections. This concern is exacerbated by a lack of documentation and analysis tools, making it difficult to interrogate these collections. Mindful of these pitfalls, we present a methodology for documentation-first, human-centered data collection. We apply this approach in an effort to train a multilingual LLM. We identify a geographically diverse set of target language groups (Arabic varieties, Basque, Chinese varieties, Catalan, English, French, Indic languages, Indonesian, Niger-Congo languages, Portuguese, Spanish, and Vietnamese, as well as programming languages) for which to collect metadata on potential data sources. We structure this effort by developing an online catalogue in English as a tool for gathering metadata through public hackathons. We present our tool and analyses of the resulting resource metadata, including distributions over languages, regions, and resource types, and discuss our lessons learned."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="mcmillan-major-etal-2024-documenting">
<titleInfo>
<title>Documenting Geographically and Contextually Diverse Language Data Sources</title>
</titleInfo>
<name type="personal">
<namePart type="given">Angelina</namePart>
<namePart type="family">McMillan-Major</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Francesco</namePart>
<namePart type="family">De Toni</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zaid</namePart>
<namePart type="family">Alyafeai</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stella</namePart>
<namePart type="family">Biderman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kimbo</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gérard</namePart>
<namePart type="family">Dupont</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hady</namePart>
<namePart type="family">Elsahar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chris</namePart>
<namePart type="family">Emezue</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alham</namePart>
<namePart type="given">Fikri</namePart>
<namePart type="family">Aji</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Suzana</namePart>
<namePart type="family">Ilić</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nurulaqilla</namePart>
<namePart type="family">Khamis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Colin</namePart>
<namePart type="family">Leong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Maraim</namePart>
<namePart type="family">Masoud</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aitor</namePart>
<namePart type="family">Soroa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pedro</namePart>
<namePart type="family">Ortiz Suarez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Daniel</namePart>
<namePart type="family">van Strien</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zeerak</namePart>
<namePart type="family">Talat</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yacine</namePart>
<namePart type="family">Jernite</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<genre authority="bibutilsgt">journal article</genre>
<relatedItem type="host">
<titleInfo>
<title>Northern European Journal of Language Technology</title>
</titleInfo>
<originInfo>
<issuance>continuing</issuance>
<publisher>Linköping University Electronic Press</publisher>
<place>
<placeTerm type="text">Linköping, Sweden</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">periodical</genre>
<genre authority="bibutilsgt">academic journal</genre>
</relatedItem>
<abstract>Contemporary large-scale data collection efforts have prioritized the amount of data collected to improve large language models (LLM). This quantitative approach has resulted in concerns for the rights of data subjects represented in data collections. This concern is exacerbated by a lack of documentation and analysis tools, making it difficult to interrogate these collections. Mindful of these pitfalls, we present a methodology for documentation-first, human-centered data collection. We apply this approach in an effort to train a multilingual LLM. We identify a geographically diverse set of target language groups (Arabic varieties, Basque, Chinese varieties, Catalan, English, French, Indic languages, Indonesian, Niger-Congo languages, Portuguese, Spanish, and Vietnamese, as well as programming languages) for which to collect metadata on potential data sources. We structure this effort by developing an online catalogue in English as a tool for gathering metadata through public hackathons. We present our tool and analyses of the resulting resource metadata, including distributions over languages, regions, and resource types, and discuss our lessons learned.</abstract>
<identifier type="citekey">mcmillan-major-etal-2024-documenting</identifier>
<identifier type="doi">10.3384/nejlt.2000-1533.2024.5217</identifier>
<location>
<url>https://aclanthology.org/2024.nejlt-1.4/</url>
</location>
<part>
<date>2024-12</date>
<detail type="volume"><number>10</number></detail>
<extent unit="page">
<start>50</start>
<end>77</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Journal Article
%T Documenting Geographically and Contextually Diverse Language Data Sources
%A McMillan-Major, Angelina
%A De Toni, Francesco
%A Alyafeai, Zaid
%A Biderman, Stella
%A Chen, Kimbo
%A Dupont, Gérard
%A Elsahar, Hady
%A Emezue, Chris
%A Aji, Alham Fikri
%A Ilić, Suzana
%A Khamis, Nurulaqilla
%A Leong, Colin
%A Masoud, Maraim
%A Soroa, Aitor
%A Ortiz Suarez, Pedro
%A van Strien, Daniel
%A Talat, Zeerak
%A Jernite, Yacine
%J Northern European Journal of Language Technology
%D 2024
%8 December
%V 10
%I Linköping University Electronic Press
%C Linköping, Sweden
%F mcmillan-major-etal-2024-documenting
%X Contemporary large-scale data collection efforts have prioritized the amount of data collected to improve large language models (LLM). This quantitative approach has resulted in concerns for the rights of data subjects represented in data collections. This concern is exacerbated by a lack of documentation and analysis tools, making it difficult to interrogate these collections. Mindful of these pitfalls, we present a methodology for documentation-first, human-centered data collection. We apply this approach in an effort to train a multilingual LLM. We identify a geographically diverse set of target language groups (Arabic varieties, Basque, Chinese varieties, Catalan, English, French, Indic languages, Indonesian, Niger-Congo languages, Portuguese, Spanish, and Vietnamese, as well as programming languages) for which to collect metadata on potential data sources. We structure this effort by developing an online catalogue in English as a tool for gathering metadata through public hackathons. We present our tool and analyses of the resulting resource metadata, including distributions over languages, regions, and resource types, and discuss our lessons learned.
%R 10.3384/nejlt.2000-1533.2024.5217
%U https://aclanthology.org/2024.nejlt-1.4/
%U https://doi.org/10.3384/nejlt.2000-1533.2024.5217
%P 50-77
Markdown (Informal)
[Documenting Geographically and Contextually Diverse Language Data Sources](https://aclanthology.org/2024.nejlt-1.4/) (McMillan-Major et al., NEJLT 2024)
ACL
- Angelina McMillan-Major, Francesco De Toni, Zaid Alyafeai, Stella Biderman, Kimbo Chen, Gérard Dupont, Hady Elsahar, Chris Emezue, Alham Fikri Aji, Suzana Ilić, Nurulaqilla Khamis, Colin Leong, Maraim Masoud, Aitor Soroa, Pedro Ortiz Suarez, Daniel van Strien, Zeerak Talat, and Yacine Jernite. 2024. Documenting Geographically and Contextually Diverse Language Data Sources. Northern European Journal of Language Technology, 10:50–77.