@inproceedings{o-meachair-etal-2022-introducing,
title = "Introducing the National Corpus of {I}rish Project",
author = "{\'O} Meachair, M{\'\i}che{\'a}l and
Bhreathnach, {\'U}na and
{\'O} Cleirc{\'\i}n, Gear{\'o}id",
editor = "Fransen, Theodorus and
Lamb, William and
Prys, Delyth",
booktitle = "Proceedings of the 4th Celtic Language Technology Workshop within LREC2022",
month = jun,
year = "2022",
address = "Marseille, France",
publisher = "European Language Resources Association",
url = "https://aclanthology.org/2022.cltw-1.14",
pages = "99--103",
abstract = "This paper introduces the National Corpus of Irish, an initiative to develop a large national corpus of written and spoken contemporary Irish as well as related specialised corpora. The newly-compiled corpora will be hosted at corpas.ie, in what will become a hub for corpus-based research on the Irish language. Users will be able to search the corpora and download data generated during the project from the corpas.ie website and appropriate third-party repositories. Corpus 1 will be a balanced general-purpose corpus containing c.155m words. Corpus 2 will be a written corpus consisting of c.100m words. Corpus 3 will be a spoken corpus containing 6.5m words. Corpus 4 will be a monitor corpus with a target size of 1m words per year from 2000 onwards. Token, lemma, and n-gram frequency lists will be published at regular intervals on the project website, and language models will be published there and on other appropriate platforms during the course of the project. This paper focuses on the background and crucial scoping stage of the project, and examines user needs as identified in a survey of potential users.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="o-meachair-etal-2022-introducing">
<titleInfo>
<title>Introducing the National Corpus of Irish Project</title>
</titleInfo>
<name type="personal">
<namePart type="given">Mícheál</namePart>
<namePart type="family">Ó Meachair</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Úna</namePart>
<namePart type="family">Bhreathnach</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gearóid</namePart>
<namePart type="family">Ó Cleircín</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 4th Celtic Language Technology Workshop within LREC2022</title>
</titleInfo>
<name type="personal">
<namePart type="given">Theodorus</namePart>
<namePart type="family">Fransen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">William</namePart>
<namePart type="family">Lamb</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Delyth</namePart>
<namePart type="family">Prys</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>European Language Resources Association</publisher>
<place>
<placeTerm type="text">Marseille, France</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>This paper introduces the National Corpus of Irish, an initiative to develop a large national corpus of written and spoken contemporary Irish as well as related specialised corpora. The newly-compiled corpora will be hosted at corpas.ie, in what will become a hub for corpus-based research on the Irish language. Users will be able to search the corpora and download data generated during the project from the corpas.ie website and appropriate third-party repositories. Corpus 1 will be a balanced general-purpose corpus containing c.155m words. Corpus 2 will be a written corpus consisting of c.100m words. Corpus 3 will be a spoken corpus containing 6.5m words. Corpus 4 will be a monitor corpus with a target size of 1m words per year from 2000 onwards. Token, lemma, and n-gram frequency lists will be published at regular intervals on the project website, and language models will be published there and on other appropriate platforms during the course of the project. This paper focuses on the background and crucial scoping stage of the project, and examines user needs as identified in a survey of potential users.</abstract>
<identifier type="citekey">o-meachair-etal-2022-introducing</identifier>
<location>
<url>https://aclanthology.org/2022.cltw-1.14</url>
</location>
<part>
<date>2022-06</date>
<extent unit="page">
<start>99</start>
<end>103</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Introducing the National Corpus of Irish Project
%A Ó Meachair, Mícheál
%A Bhreathnach, Úna
%A Ó Cleircín, Gearóid
%Y Fransen, Theodorus
%Y Lamb, William
%Y Prys, Delyth
%S Proceedings of the 4th Celtic Language Technology Workshop within LREC2022
%D 2022
%8 June
%I European Language Resources Association
%C Marseille, France
%F o-meachair-etal-2022-introducing
%X This paper introduces the National Corpus of Irish, an initiative to develop a large national corpus of written and spoken contemporary Irish as well as related specialised corpora. The newly-compiled corpora will be hosted at corpas.ie, in what will become a hub for corpus-based research on the Irish language. Users will be able to search the corpora and download data generated during the project from the corpas.ie website and appropriate third-party repositories. Corpus 1 will be a balanced general-purpose corpus containing c.155m words. Corpus 2 will be a written corpus consisting of c.100m words. Corpus 3 will be a spoken corpus containing 6.5m words. Corpus 4 will be a monitor corpus with a target size of 1m words per year from 2000 onwards. Token, lemma, and n-gram frequency lists will be published at regular intervals on the project website, and language models will be published there and on other appropriate platforms during the course of the project. This paper focuses on the background and crucial scoping stage of the project, and examines user needs as identified in a survey of potential users.
%U https://aclanthology.org/2022.cltw-1.14
%P 99-103
Markdown (Informal)
[Introducing the National Corpus of Irish Project](https://aclanthology.org/2022.cltw-1.14) (Ó Meachair et al., CLTW 2022)
ACL
- Mícheál Ó Meachair, Úna Bhreathnach, and Gearóid Ó Cleircín. 2022. Introducing the National Corpus of Irish Project. In Proceedings of the 4th Celtic Language Technology Workshop within LREC2022, pages 99–103, Marseille, France. European Language Resources Association.