% Conference-paper record for the CLSSTS2020 workshop proceedings.
% NOTE(review): the third editor was recorded as a bare given name ("Elizabeth");
% the surname (Boschee) is restored here -- confirm against the proceedings front matter.
@inproceedings{zavorin-etal-2020-corpora,
    title = "Corpora for Cross-Language Information Retrieval in Six Less-Resourced Languages",
    author = "Zavorin, Ilya and
      Bills, Aric and
      Corey, Cassian and
      Morrison, Michelle and
      Tong, Audrey and
      Tong, Richard",
    editor = "McKeown, Kathy and
      Oard, Douglas W. and
      Boschee, Elizabeth and
      Schwartz, Richard",
    booktitle = "Proceedings of the workshop on Cross-Language Search and Summarization of Text and Speech (CLSSTS2020)",
    month = may,
    year = "2020",
    address = "Marseille, France",
    publisher = "European Language Resources Association",
    url = "https://aclanthology.org/2020.clssts-1.2",
    pages = "7--13",
    abstract = "The Machine Translation for English Retrieval of Information in Any Language (MATERIAL) research program, sponsored by the Intelligence Advanced Research Projects Activity (IARPA), focuses on rapid development of end-to-end systems capable of retrieving foreign language speech and text documents relevant to different types of English queries that may be further restricted by domain. Those systems also provide evidence of relevance of the retrieved content in the form of English summaries. The program focuses on Less-Resourced Languages and provides its performer teams very limited amounts of annotated training data. This paper describes the corpora that were created for system development and evaluation for the six languages released by the program to date: Tagalog, Swahili, Somali, Lithuanian, Bulgarian and Pashto. The corpora include build packs to train Machine Translation and Automatic Speech Recognition systems; document sets in three text and three speech genres annotated for domain and partitioned for analysis, development and evaluation; and queries of several types together with corresponding binary relevance judgments against the entire set of documents. The paper also describes a detection metric called Actual Query Weighted Value developed by the program to evaluate end-to-end system performance.",
    language = "English",
    isbn = "979-10-95546-55-9",
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 record for the paper "zavorin-etal-2020-corpora": six authors, the host
     proceedings volume with its editors, abstract, URL and page extent. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="zavorin-etal-2020-corpora">
    <titleInfo>
      <title>Corpora for Cross-Language Information Retrieval in Six Less-Resourced Languages</title>
    </titleInfo>
    <!-- Authors, in citation order. -->
    <name type="personal">
      <namePart type="given">Ilya</namePart>
      <namePart type="family">Zavorin</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Aric</namePart>
      <namePart type="family">Bills</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Cassian</namePart>
      <namePart type="family">Corey</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Michelle</namePart>
      <namePart type="family">Morrison</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Audrey</namePart>
      <namePart type="family">Tong</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Richard</namePart>
      <namePart type="family">Tong</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2020-05</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <language>
      <languageTerm type="text">English</languageTerm>
      <languageTerm type="code" authority="iso639-2b">eng</languageTerm>
    </language>
    <!-- Host item: the proceedings volume containing this paper, with its editors. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the workshop on Cross-Language Search and Summarization of Text and Speech (CLSSTS2020)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Kathy</namePart>
        <namePart type="family">McKeown</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Douglas</namePart>
        <namePart type="given">W</namePart>
        <namePart type="family">Oard</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <!-- NOTE(review): this editor was recorded as an untyped name with only "Elizabeth";
           the family name (Boschee) is restored here and should be confirmed against the
           proceedings front matter. -->
      <name type="personal">
        <namePart type="given">Elizabeth</namePart>
        <namePart type="family">Boschee</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Richard</namePart>
        <namePart type="family">Schwartz</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>European Language Resources Association</publisher>
        <place>
          <placeTerm type="text">Marseille, France</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-10-95546-55-9</identifier>
    </relatedItem>
    <abstract>The Machine Translation for English Retrieval of Information in Any Language (MATERIAL) research program, sponsored by the Intelligence Advanced Research Projects Activity (IARPA), focuses on rapid development of end-to-end systems capable of retrieving foreign language speech and text documents relevant to different types of English queries that may be further restricted by domain. Those systems also provide evidence of relevance of the retrieved content in the form of English summaries. The program focuses on Less-Resourced Languages and provides its performer teams very limited amounts of annotated training data. This paper describes the corpora that were created for system development and evaluation for the six languages released by the program to date: Tagalog, Swahili, Somali, Lithuanian, Bulgarian and Pashto. The corpora include build packs to train Machine Translation and Automatic Speech Recognition systems; document sets in three text and three speech genres annotated for domain and partitioned for analysis, development and evaluation; and queries of several types together with corresponding binary relevance judgments against the entire set of documents. The paper also describes a detection metric called Actual Query Weighted Value developed by the program to evaluate end-to-end system performance.</abstract>
    <identifier type="citekey">zavorin-etal-2020-corpora</identifier>
    <location>
      <url>https://aclanthology.org/2020.clssts-1.2</url>
    </location>
    <!-- Position of the paper within the host volume: pages 7 to 13. -->
    <part>
      <date>2020-05</date>
      <extent unit="page">
        <start>7</start>
        <end>13</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Corpora for Cross-Language Information Retrieval in Six Less-Resourced Languages
%A Zavorin, Ilya
%A Bills, Aric
%A Corey, Cassian
%A Morrison, Michelle
%A Tong, Audrey
%A Tong, Richard
%Y McKeown, Kathy
%Y Oard, Douglas W.
%Y Boschee, Elizabeth
%Y Schwartz, Richard
%S Proceedings of the workshop on Cross-Language Search and Summarization of Text and Speech (CLSSTS2020)
%D 2020
%8 May
%I European Language Resources Association
%C Marseille, France
%@ 979-10-95546-55-9
%G English
%F zavorin-etal-2020-corpora
%X The Machine Translation for English Retrieval of Information in Any Language (MATERIAL) research program, sponsored by the Intelligence Advanced Research Projects Activity (IARPA), focuses on rapid development of end-to-end systems capable of retrieving foreign language speech and text documents relevant to different types of English queries that may be further restricted by domain. Those systems also provide evidence of relevance of the retrieved content in the form of English summaries. The program focuses on Less-Resourced Languages and provides its performer teams very limited amounts of annotated training data. This paper describes the corpora that were created for system development and evaluation for the six languages released by the program to date: Tagalog, Swahili, Somali, Lithuanian, Bulgarian and Pashto. The corpora include build packs to train Machine Translation and Automatic Speech Recognition systems; document sets in three text and three speech genres annotated for domain and partitioned for analysis, development and evaluation; and queries of several types together with corresponding binary relevance judgments against the entire set of documents. The paper also describes a detection metric called Actual Query Weighted Value developed by the program to evaluate end-to-end system performance.
%U https://aclanthology.org/2020.clssts-1.2
%P 7-13
Markdown (Informal)
[Corpora for Cross-Language Information Retrieval in Six Less-Resourced Languages](https://aclanthology.org/2020.clssts-1.2) (Zavorin et al., CLSSTS 2020)
ACL