@inproceedings{mccarthy-etal-2020-johns,
title = "The {J}ohns {H}opkins {U}niversity {B}ible Corpus: 1600+ Tongues for Typological Exploration",
author = "McCarthy, Arya D. and
Wicks, Rachel and
Lewis, Dylan and
Mueller, Aaron and
Wu, Winston and
Adams, Oliver and
Nicolai, Garrett and
Post, Matt and
Yarowsky, David",
booktitle = "Proceedings of the Twelfth Language Resources and Evaluation Conference",
month = may,
year = "2020",
address = "Marseille, France",
publisher = "European Language Resources Association",
url = "https://aclanthology.org/2020.lrec-1.352",
pages = "2884--2892",
abstract = "We present findings from the creation of a massively parallel corpus in over 1600 languages, the Johns Hopkins University Bible Corpus (JHUBC). The corpus consists of over 4000 unique translations of the Christian Bible and counting. Our data is derived from scraping several online resources and merging them with existing corpora, combining them under a common scheme that is verse-parallel across all translations. We detail our effort to scrape, clean, align, and utilize this ripe multilingual dataset. The corpus captures the great typological variety of the world{'}s languages. We catalog this by showing highly similar proportions of representation of Ethnologue{'}s typological features in our corpus. We also give an example application: projecting pronoun features like clusivity across alignments to richly annotate languages which do not mark the distinction.",
language = "English",
ISBN = "979-10-95546-34-4",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="mccarthy-etal-2020-johns">
<titleInfo>
<title>The Johns Hopkins University Bible Corpus: 1600+ Tongues for Typological Exploration</title>
</titleInfo>
<name type="personal">
<namePart type="given">Arya</namePart>
<namePart type="given">D</namePart>
<namePart type="family">McCarthy</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rachel</namePart>
<namePart type="family">Wicks</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dylan</namePart>
<namePart type="family">Lewis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aaron</namePart>
<namePart type="family">Mueller</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Winston</namePart>
<namePart type="family">Wu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Oliver</namePart>
<namePart type="family">Adams</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Garrett</namePart>
<namePart type="family">Nicolai</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Matt</namePart>
<namePart type="family">Post</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Yarowsky</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<language>
<languageTerm type="text">English</languageTerm>
<languageTerm type="code" authority="iso639-2b">eng</languageTerm>
</language>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Twelfth Language Resources and Evaluation Conference</title>
</titleInfo>
<originInfo>
<publisher>European Language Resources Association</publisher>
<place>
<placeTerm type="text">Marseille, France</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-10-95546-34-4</identifier>
</relatedItem>
<abstract>We present findings from the creation of a massively parallel corpus in over 1600 languages, the Johns Hopkins University Bible Corpus (JHUBC). The corpus consists of over 4000 unique translations of the Christian Bible and counting. Our data is derived from scraping several online resources and merging them with existing corpora, combining them under a common scheme that is verse-parallel across all translations. We detail our effort to scrape, clean, align, and utilize this ripe multilingual dataset. The corpus captures the great typological variety of the world’s languages. We catalog this by showing highly similar proportions of representation of Ethnologue’s typological features in our corpus. We also give an example application: projecting pronoun features like clusivity across alignments to richly annotate languages which do not mark the distinction.</abstract>
<identifier type="citekey">mccarthy-etal-2020-johns</identifier>
<location>
<url>https://aclanthology.org/2020.lrec-1.352</url>
</location>
<part>
<date>2020-05</date>
<extent unit="page">
<start>2884</start>
<end>2892</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T The Johns Hopkins University Bible Corpus: 1600+ Tongues for Typological Exploration
%A McCarthy, Arya D.
%A Wicks, Rachel
%A Lewis, Dylan
%A Mueller, Aaron
%A Wu, Winston
%A Adams, Oliver
%A Nicolai, Garrett
%A Post, Matt
%A Yarowsky, David
%S Proceedings of the Twelfth Language Resources and Evaluation Conference
%D 2020
%8 May
%I European Language Resources Association
%C Marseille, France
%@ 979-10-95546-34-4
%G English
%F mccarthy-etal-2020-johns
%X We present findings from the creation of a massively parallel corpus in over 1600 languages, the Johns Hopkins University Bible Corpus (JHUBC). The corpus consists of over 4000 unique translations of the Christian Bible and counting. Our data is derived from scraping several online resources and merging them with existing corpora, combining them under a common scheme that is verse-parallel across all translations. We detail our effort to scrape, clean, align, and utilize this ripe multilingual dataset. The corpus captures the great typological variety of the world’s languages. We catalog this by showing highly similar proportions of representation of Ethnologue’s typological features in our corpus. We also give an example application: projecting pronoun features like clusivity across alignments to richly annotate languages which do not mark the distinction.
%U https://aclanthology.org/2020.lrec-1.352
%P 2884-2892
Markdown (Informal)
[The Johns Hopkins University Bible Corpus: 1600+ Tongues for Typological Exploration](https://aclanthology.org/2020.lrec-1.352) (McCarthy et al., LREC 2020)
ACL
- Arya D. McCarthy, Rachel Wicks, Dylan Lewis, Aaron Mueller, Winston Wu, Oliver Adams, Garrett Nicolai, Matt Post, and David Yarowsky. 2020. The Johns Hopkins University Bible Corpus: 1600+ Tongues for Typological Exploration. In Proceedings of the Twelfth Language Resources and Evaluation Conference, pages 2884–2892, Marseille, France. European Language Resources Association.