@inproceedings{zavarsky-etal-2005-language,
title = "Language and Encoding Scheme Identification of Extremely Large Sets of Multilingual Text",
author = "Zavarsky, Pavol and
Mikami, Yoshiki and
Wada, Shota",
booktitle = "Proceedings of Machine Translation Summit X: Posters",
month = sep # " 13-15",
year = "2005",
address = "Phuket, Thailand",
url = "https://aclanthology.org/2005.mtsummit-posters.5",
pages = "354--355",
abstract = "In the paper we present an outline of our approach to identify languages and encoding schemes in extremely large sets of multi-lingual documents. The large sets we are analyzing in our Language Observatory project [1] are formed by dozens of millions of text documents. In the paper we present an approach which allows us to analyze about 250 documents every second (about 20 million documents/day) on a single Linux machine. Using a multithread processing on a cluster of Linux servers we are able to analyze easily more than 100 million documents/day.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="zavarsky-etal-2005-language">
<titleInfo>
<title>Language and Encoding Scheme Identification of Extremely Large Sets of Multilingual Text</title>
</titleInfo>
<name type="personal">
<namePart type="given">Pavol</namePart>
<namePart type="family">Zavarsky</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yoshiki</namePart>
<namePart type="family">Mikami</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shota</namePart>
<namePart type="family">Wada</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2005-sep 13-15</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of Machine Translation Summit X: Posters</title>
</titleInfo>
<originInfo>
<place>
<placeTerm type="text">Phuket, Thailand</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>In the paper we present an outline of our approach to identify languages and encoding schemes in extremely large sets of multi-lingual documents. The large sets we are analyzing in our Language Observatory project [1] are formed by dozens of millions of text documents. In the paper we present an approach which allows us to analyze about 250 documents every second (about 20 million documents/day) on a single Linux machine. Using a multithread processing on a cluster of Linux servers we are able to analyze easily more than 100 million documents/day.</abstract>
<identifier type="citekey">zavarsky-etal-2005-language</identifier>
<location>
<url>https://aclanthology.org/2005.mtsummit-posters.5</url>
</location>
<part>
<date>2005-sep 13-15</date>
<extent unit="page">
<start>354</start>
<end>355</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Language and Encoding Scheme Identification of Extremely Large Sets of Multilingual Text
%A Zavarsky, Pavol
%A Mikami, Yoshiki
%A Wada, Shota
%S Proceedings of Machine Translation Summit X: Posters
%D 2005
%8 sep 13 15
%C Phuket, Thailand
%F zavarsky-etal-2005-language
%X In the paper we present an outline of our approach to identify languages and encoding schemes in extremely large sets of multi-lingual documents. The large sets we are analyzing in our Language Observatory project [1] are formed by dozens of millions of text documents. In the paper we present an approach which allows us to analyze about 250 documents every second (about 20 million documents/day) on a single Linux machine. Using a multithread processing on a cluster of Linux servers we are able to analyze easily more than 100 million documents/day.
%U https://aclanthology.org/2005.mtsummit-posters.5
%P 354-355
Markdown (Informal)
[Language and Encoding Scheme Identification of Extremely Large Sets of Multilingual Text](https://aclanthology.org/2005.mtsummit-posters.5) (Zavarsky et al., MTSummit 2005)
ACL