@inproceedings{klubicka-etal-2022-challenges,
title = "Challenges of Building Domain-Specific Parallel Corpora from Public Administration Documents",
author = "Klubi{\v{c}}ka, Filip and
Kasuni{\'c}, Lorena and
Blazsetin, Danijel and
Bago, Petra",
editor = "Rapp, Reinhard and
Zweigenbaum, Pierre and
Sharoff, Serge",
booktitle = "Proceedings of the BUCC Workshop within LREC 2022",
month = jun,
year = "2022",
address = "Marseille, France",
publisher = "European Language Resources Association",
url = "https://aclanthology.org/2022.bucc-1.7",
pages = "50--55",
abstract = "PRINCIPLE was a Connecting Europe Facility (CEF)-funded project that focused on the identification, collection and processing of language resources (LRs) for four European under-resourced languages (Croatian, Icelandic, Irish and Norwegian) in order to improve translation quality of eTranslation, an online machine translation (MT) tool provided by the European Commission. The collected LRs were used for the development of neural MT engines in order to verify the quality of the resources. For all four languages, a total of 66 LRs were collected and made available on the ELRC-SHARE repository under various licenses. For Croatian, we have collected and published 20 LRs: 19 parallel corpora and 1 glossary. The majority of data is in the general domain (72 {\%} of translation units), while the rest is in the eJustice (23 {\%}), eHealth (3 {\%}) and eProcurement (2 {\%}) Digital Service Infrastructures (DSI) domains. The majority of the resources were for the Croatian-English language pair. The data was donated by six data contributors from the public as well as private sector. In this paper we present a subset of 13 Croatian LRs developed based on public administration documents, which are all made freely available, as well as challenges associated with the data collection, cleaning and processing.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="klubicka-etal-2022-challenges">
<titleInfo>
<title>Challenges of Building Domain-Specific Parallel Corpora from Public Administration Documents</title>
</titleInfo>
<name type="personal">
<namePart type="given">Filip</namePart>
<namePart type="family">Klubička</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lorena</namePart>
<namePart type="family">Kasunić</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Danijel</namePart>
<namePart type="family">Blazsetin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Petra</namePart>
<namePart type="family">Bago</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the BUCC Workshop within LREC 2022</title>
</titleInfo>
<name type="personal">
<namePart type="given">Reinhard</namePart>
<namePart type="family">Rapp</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pierre</namePart>
<namePart type="family">Zweigenbaum</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Serge</namePart>
<namePart type="family">Sharoff</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>European Language Resources Association</publisher>
<place>
<placeTerm type="text">Marseille, France</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>PRINCIPLE was a Connecting Europe Facility (CEF)-funded project that focused on the identification, collection and processing of language resources (LRs) for four European under-resourced languages (Croatian, Icelandic, Irish and Norwegian) in order to improve translation quality of eTranslation, an online machine translation (MT) tool provided by the European Commission. The collected LRs were used for the development of neural MT engines in order to verify the quality of the resources. For all four languages, a total of 66 LRs were collected and made available on the ELRC-SHARE repository under various licenses. For Croatian, we have collected and published 20 LRs: 19 parallel corpora and 1 glossary. The majority of data is in the general domain (72 % of translation units), while the rest is in the eJustice (23 %), eHealth (3 %) and eProcurement (2 %) Digital Service Infrastructures (DSI) domains. The majority of the resources were for the Croatian-English language pair. The data was donated by six data contributors from the public as well as private sector. In this paper we present a subset of 13 Croatian LRs developed based on public administration documents, which are all made freely available, as well as challenges associated with the data collection, cleaning and processing.</abstract>
<identifier type="citekey">klubicka-etal-2022-challenges</identifier>
<location>
<url>https://aclanthology.org/2022.bucc-1.7</url>
</location>
<part>
<date>2022-06</date>
<extent unit="page">
<start>50</start>
<end>55</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Challenges of Building Domain-Specific Parallel Corpora from Public Administration Documents
%A Klubička, Filip
%A Kasunić, Lorena
%A Blazsetin, Danijel
%A Bago, Petra
%Y Rapp, Reinhard
%Y Zweigenbaum, Pierre
%Y Sharoff, Serge
%S Proceedings of the BUCC Workshop within LREC 2022
%D 2022
%8 June
%I European Language Resources Association
%C Marseille, France
%F klubicka-etal-2022-challenges
%X PRINCIPLE was a Connecting Europe Facility (CEF)-funded project that focused on the identification, collection and processing of language resources (LRs) for four European under-resourced languages (Croatian, Icelandic, Irish and Norwegian) in order to improve translation quality of eTranslation, an online machine translation (MT) tool provided by the European Commission. The collected LRs were used for the development of neural MT engines in order to verify the quality of the resources. For all four languages, a total of 66 LRs were collected and made available on the ELRC-SHARE repository under various licenses. For Croatian, we have collected and published 20 LRs: 19 parallel corpora and 1 glossary. The majority of data is in the general domain (72 % of translation units), while the rest is in the eJustice (23 %), eHealth (3 %) and eProcurement (2 %) Digital Service Infrastructures (DSI) domains. The majority of the resources were for the Croatian-English language pair. The data was donated by six data contributors from the public as well as private sector. In this paper we present a subset of 13 Croatian LRs developed based on public administration documents, which are all made freely available, as well as challenges associated with the data collection, cleaning and processing.
%U https://aclanthology.org/2022.bucc-1.7
%P 50-55
Markdown (Informal)
[Challenges of Building Domain-Specific Parallel Corpora from Public Administration Documents](https://aclanthology.org/2022.bucc-1.7) (Klubička et al., BUCC 2022)
ACL