@inproceedings{ogundepo-etal-2022-africlirmatrix,
title = "{A}fri{CLIRM}atrix: Enabling Cross-Lingual Information Retrieval for {A}frican Languages",
author = "Ogundepo, Odunayo and
Zhang, Xinyu and
Sun, Shuo and
Duh, Kevin and
Lin, Jimmy",
editor = "Goldberg, Yoav and
Kozareva, Zornitsa and
Zhang, Yue",
booktitle = "Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing",
month = dec,
year = "2022",
address = "Abu Dhabi, United Arab Emirates",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.emnlp-main.597/",
doi = "10.18653/v1/2022.emnlp-main.597",
pages = "8721--8728",
abstract = "Language diversity in NLP is critical in enabling the development of tools for a wide range of users.However, there are limited resources for building such tools for many languages, particularly those spoken in Africa.For search, most existing datasets feature few or no African languages, directly impacting researchers' ability to build and improve information access capabilities in those languages.Motivated by this, we created AfriCLIRMatrix, a test collection for cross-lingual information retrieval research in 15 diverse African languages.In total, our dataset contains 6 million queries in English and 23 million relevance judgments automatically mined from Wikipedia inter-language links, covering many more African languages than any existing information retrieval test collection.In addition, we release BM25, dense retrieval, and sparse{--}dense hybrid baselines to provide a starting point for the development of future systems.We hope that these efforts can spur additional work in search for African languages.AfriCLIRMatrix can be downloaded at https://github.com/castorini/africlirmatrix."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="ogundepo-etal-2022-africlirmatrix">
<titleInfo>
<title>AfriCLIRMatrix: Enabling Cross-Lingual Information Retrieval for African Languages</title>
</titleInfo>
<name type="personal">
<namePart type="given">Odunayo</namePart>
<namePart type="family">Ogundepo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xinyu</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shuo</namePart>
<namePart type="family">Sun</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kevin</namePart>
<namePart type="family">Duh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jimmy</namePart>
<namePart type="family">Lin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yoav</namePart>
<namePart type="family">Goldberg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zornitsa</namePart>
<namePart type="family">Kozareva</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yue</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, United Arab Emirates</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Language diversity in NLP is critical in enabling the development of tools for a wide range of users.However, there are limited resources for building such tools for many languages, particularly those spoken in Africa.For search, most existing datasets feature few or no African languages, directly impacting researchers’ ability to build and improve information access capabilities in those languages.Motivated by this, we created AfriCLIRMatrix, a test collection for cross-lingual information retrieval research in 15 diverse African languages.In total, our dataset contains 6 million queries in English and 23 million relevance judgments automatically mined from Wikipedia inter-language links, covering many more African languages than any existing information retrieval test collection.In addition, we release BM25, dense retrieval, and sparse–dense hybrid baselines to provide a starting point for the development of future systems.We hope that these efforts can spur additional work in search for African languages.AfriCLIRMatrix can be downloaded at https://github.com/castorini/africlirmatrix.</abstract>
<identifier type="citekey">ogundepo-etal-2022-africlirmatrix</identifier>
<identifier type="doi">10.18653/v1/2022.emnlp-main.597</identifier>
<location>
<url>https://aclanthology.org/2022.emnlp-main.597/</url>
</location>
<part>
<date>2022-12</date>
<extent unit="page">
<start>8721</start>
<end>8728</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T AfriCLIRMatrix: Enabling Cross-Lingual Information Retrieval for African Languages
%A Ogundepo, Odunayo
%A Zhang, Xinyu
%A Sun, Shuo
%A Duh, Kevin
%A Lin, Jimmy
%Y Goldberg, Yoav
%Y Kozareva, Zornitsa
%Y Zhang, Yue
%S Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing
%D 2022
%8 December
%I Association for Computational Linguistics
%C Abu Dhabi, United Arab Emirates
%F ogundepo-etal-2022-africlirmatrix
%X Language diversity in NLP is critical in enabling the development of tools for a wide range of users.However, there are limited resources for building such tools for many languages, particularly those spoken in Africa.For search, most existing datasets feature few or no African languages, directly impacting researchers’ ability to build and improve information access capabilities in those languages.Motivated by this, we created AfriCLIRMatrix, a test collection for cross-lingual information retrieval research in 15 diverse African languages.In total, our dataset contains 6 million queries in English and 23 million relevance judgments automatically mined from Wikipedia inter-language links, covering many more African languages than any existing information retrieval test collection.In addition, we release BM25, dense retrieval, and sparse–dense hybrid baselines to provide a starting point for the development of future systems.We hope that these efforts can spur additional work in search for African languages.AfriCLIRMatrix can be downloaded at https://github.com/castorini/africlirmatrix.
%R 10.18653/v1/2022.emnlp-main.597
%U https://aclanthology.org/2022.emnlp-main.597/
%U https://doi.org/10.18653/v1/2022.emnlp-main.597
%P 8721-8728
Markdown (Informal)
[AfriCLIRMatrix: Enabling Cross-Lingual Information Retrieval for African Languages](https://aclanthology.org/2022.emnlp-main.597/) (Ogundepo et al., EMNLP 2022)
ACL