@inproceedings{pimienta-2022-resource,
title = "Resource: Indicators on the Presence of Languages in {I}nternet",
author = "Pimienta, Daniel",
editor = "Melero, Maite and
Sakti, Sakriani and
Soria, Claudia",
booktitle = "Proceedings of the 1st Annual Meeting of the ELRA/ISCA Special Interest Group on Under-Resourced Languages",
month = jun,
year = "2022",
address = "Marseille, France",
publisher = "European Language Resources Association",
url = "https://aclanthology.org/2022.sigul-1.11",
pages = "83--91",
abstract = "Reliable and maintained indicators of the space of languages on the Internet are required to support appropriate public policies and well-informed linguistic studies. Current sources are scarce and often strongly biased. The model to produce indicators on the presence of languages in the Internet, launched by the Observatory in 2017, has reached a sensible level of maturity and its data products are shared in CC-BY-SA 4.0 license. It reaches now 329 languages (L1 speakers {\textgreater} one million) and all the biases associated with the model have been controlled to an acceptable threshold, giving trust to the data, within an estimated confidence interval of +-20{\%}. Some of the indicators (mainly the percentage of L1+L2 speakers connected to the Internet per language and derivates) rely on Ethnologue Global Dataset {\#}24 for demo-linguistic data and ITU, completed by World Bank, for the percentage of persons connected to the Internet by country. The rest of indicators relies on the previous sources plus a large combination of hundreds of different sources for data related to Web contents per language. This research poster focuses the description of the new linguistic resources created. Methodological considerations are only exposed briefly and will be developed in another paper.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="pimienta-2022-resource">
<titleInfo>
<title>Resource: Indicators on the Presence of Languages in Internet</title>
</titleInfo>
<name type="personal">
<namePart type="given">Daniel</namePart>
<namePart type="family">Pimienta</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 1st Annual Meeting of the ELRA/ISCA Special Interest Group on Under-Resourced Languages</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maite</namePart>
<namePart type="family">Melero</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sakriani</namePart>
<namePart type="family">Sakti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Claudia</namePart>
<namePart type="family">Soria</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>European Language Resources Association</publisher>
<place>
<placeTerm type="text">Marseille, France</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Reliable and maintained indicators of the space of languages on the Internet are required to support appropriate public policies and well-informed linguistic studies. Current sources are scarce and often strongly biased. The model to produce indicators on the presence of languages in the Internet, launched by the Observatory in 2017, has reached a sensible level of maturity and its data products are shared in CC-BY-SA 4.0 license. It reaches now 329 languages (L1 speakers &gt; one million) and all the biases associated with the model have been controlled to an acceptable threshold, giving trust to the data, within an estimated confidence interval of +-20%. Some of the indicators (mainly the percentage of L1+L2 speakers connected to the Internet per language and derivates) rely on Ethnologue Global Dataset #24 for demo-linguistic data and ITU, completed by World Bank, for the percentage of persons connected to the Internet by country. The rest of indicators relies on the previous sources plus a large combination of hundreds of different sources for data related to Web contents per language. This research poster focuses the description of the new linguistic resources created. Methodological considerations are only exposed briefly and will be developed in another paper.</abstract>
<identifier type="citekey">pimienta-2022-resource</identifier>
<location>
<url>https://aclanthology.org/2022.sigul-1.11</url>
</location>
<part>
<date>2022-06</date>
<extent unit="page">
<start>83</start>
<end>91</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Resource: Indicators on the Presence of Languages in Internet
%A Pimienta, Daniel
%Y Melero, Maite
%Y Sakti, Sakriani
%Y Soria, Claudia
%S Proceedings of the 1st Annual Meeting of the ELRA/ISCA Special Interest Group on Under-Resourced Languages
%D 2022
%8 June
%I European Language Resources Association
%C Marseille, France
%F pimienta-2022-resource
%X Reliable and maintained indicators of the space of languages on the Internet are required to support appropriate public policies and well-informed linguistic studies. Current sources are scarce and often strongly biased. The model to produce indicators on the presence of languages in the Internet, launched by the Observatory in 2017, has reached a sensible level of maturity and its data products are shared in CC-BY-SA 4.0 license. It reaches now 329 languages (L1 speakers > one million) and all the biases associated with the model have been controlled to an acceptable threshold, giving trust to the data, within an estimated confidence interval of +-20%. Some of the indicators (mainly the percentage of L1+L2 speakers connected to the Internet per language and derivates) rely on Ethnologue Global Dataset #24 for demo-linguistic data and ITU, completed by World Bank, for the percentage of persons connected to the Internet by country. The rest of indicators relies on the previous sources plus a large combination of hundreds of different sources for data related to Web contents per language. This research poster focuses the description of the new linguistic resources created. Methodological considerations are only exposed briefly and will be developed in another paper.
%U https://aclanthology.org/2022.sigul-1.11
%P 83-91
Markdown (Informal)
[Resource: Indicators on the Presence of Languages in Internet](https://aclanthology.org/2022.sigul-1.11) (Pimienta, SIGUL 2022)
ACL