@inproceedings{poujade-etal-2024-corpusarieja,
title = "{C}orpus{A}ri{\`e}ja: Building an Annotated Corpus with Variation in {O}ccitan",
author = "Poujade, Clamenca and
Bras, Myriam and
Urieli, Assaf",
editor = "Melero, Maite and
Sakti, Sakriani and
Soria, Claudia",
booktitle = "Proceedings of the 3rd Annual Meeting of the Special Interest Group on Under-resourced Languages @ LREC-COLING 2024",
month = may,
year = "2024",
address = "Torino, Italia",
publisher = "ELRA and ICCL",
url = "https://aclanthology.org/2024.sigul-1.9",
pages = "66--71",
abstract = "The Occitan language is a less resourced language and is classified as {`}in danger{'} by the UNESCO. Thereby, it is important to build resources and tools that can help to safeguard and develop the digitisation of the language. CorpusAri{\`e}ja is a collection of 72 texts (just over 41,000 tokens) in the Occitan language of the French department of Ari{\`e}ge. The majority of the texts needed to be digitised and pass within an Optical Character Recognition. This corpus contains dialectal and spelling variation, but is limited to prose, without diachronic variation or genre variation. It is an annotated corpus with two levels of lemmatisation, POS tags and verbal inflection. One of the main aims of the corpus is to enable the conception of tools that can automatically annotate all Occitan texts, regardless of the dialect or spelling used. The Ari{\`e}ge territory is interesting because it includes the two variations that we focus on, dialectal and spelling. It has plenty of authors that write in their native language, their variety of Occitan.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="poujade-etal-2024-corpusarieja">
<titleInfo>
<title>CorpusArièja: Building an Annotated Corpus with Variation in Occitan</title>
</titleInfo>
<name type="personal">
<namePart type="given">Clamenca</namePart>
<namePart type="family">Poujade</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Myriam</namePart>
<namePart type="family">Bras</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Assaf</namePart>
<namePart type="family">Urieli</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 3rd Annual Meeting of the Special Interest Group on Under-resourced Languages @ LREC-COLING 2024</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maite</namePart>
<namePart type="family">Melero</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sakriani</namePart>
<namePart type="family">Sakti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Claudia</namePart>
<namePart type="family">Soria</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>ELRA and ICCL</publisher>
<place>
<placeTerm type="text">Torino, Italia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>The Occitan language is a less resourced language and is classified as ‘in danger’ by the UNESCO. Thereby, it is important to build resources and tools that can help to safeguard and develop the digitisation of the language. CorpusArièja is a collection of 72 texts (just over 41,000 tokens) in the Occitan language of the French department of Ariège. The majority of the texts needed to be digitised and pass within an Optical Character Recognition. This corpus contains dialectal and spelling variation, but is limited to prose, without diachronic variation or genre variation. It is an annotated corpus with two levels of lemmatisation, POS tags and verbal inflection. One of the main aims of the corpus is to enable the conception of tools that can automatically annotate all Occitan texts, regardless of the dialect or spelling used. The Ariège territory is interesting because it includes the two variations that we focus on, dialectal and spelling. It has plenty of authors that write in their native language, their variety of Occitan.</abstract>
<identifier type="citekey">poujade-etal-2024-corpusarieja</identifier>
<location>
<url>https://aclanthology.org/2024.sigul-1.9</url>
</location>
<part>
<date>2024-05</date>
<extent unit="page">
<start>66</start>
<end>71</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T CorpusArièja: Building an Annotated Corpus with Variation in Occitan
%A Poujade, Clamenca
%A Bras, Myriam
%A Urieli, Assaf
%Y Melero, Maite
%Y Sakti, Sakriani
%Y Soria, Claudia
%S Proceedings of the 3rd Annual Meeting of the Special Interest Group on Under-resourced Languages @ LREC-COLING 2024
%D 2024
%8 May
%I ELRA and ICCL
%C Torino, Italia
%F poujade-etal-2024-corpusarieja
%X The Occitan language is a less resourced language and is classified as ‘in danger’ by the UNESCO. Thereby, it is important to build resources and tools that can help to safeguard and develop the digitisation of the language. CorpusArièja is a collection of 72 texts (just over 41,000 tokens) in the Occitan language of the French department of Ariège. The majority of the texts needed to be digitised and pass within an Optical Character Recognition. This corpus contains dialectal and spelling variation, but is limited to prose, without diachronic variation or genre variation. It is an annotated corpus with two levels of lemmatisation, POS tags and verbal inflection. One of the main aims of the corpus is to enable the conception of tools that can automatically annotate all Occitan texts, regardless of the dialect or spelling used. The Ariège territory is interesting because it includes the two variations that we focus on, dialectal and spelling. It has plenty of authors that write in their native language, their variety of Occitan.
%U https://aclanthology.org/2024.sigul-1.9
%P 66-71
Markdown (Informal)
[CorpusArièja: Building an Annotated Corpus with Variation in Occitan](https://aclanthology.org/2024.sigul-1.9) (Poujade et al., SIGUL-WS 2024)
ACL