@inproceedings{barbaresi-lejeune-2020-box,
title = "Out-of-the-Box and into the Ditch? Multilingual Evaluation of Generic Text Extraction Tools",
author = {Barbaresi, Adrien and
Lejeune, Ga{\"e}l},
booktitle = "Proceedings of the 12th Web as Corpus Workshop",
month = may,
year = "2020",
address = "Marseille, France",
publisher = "European Language Resources Association",
url = "https://aclanthology.org/2020.wac-1.2",
pages = "5--13",
abstract = "This article examines extraction methods designed to retain the main text content of web pages and discusses how the extraction could be oriented and evaluated: can and should it be as generic as possible to ensure opportunistic corpus construction? The evaluation grounds on a comparative benchmark of open-source tools used on pages in five different languages (Chinese, English, Greek, Polish and Russian), it features several metrics to obtain more fine-grained differentiations. Our experiments highlight the diversity of web page layouts across languages or publishing countries. These discrepancies are reflected by diverging performances so that the right tool has to be chosen accordingly.",
language = "English",
ISBN = "979-10-95546-68-9",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="barbaresi-lejeune-2020-box">
<titleInfo>
<title>Out-of-the-Box and into the Ditch? Multilingual Evaluation of Generic Text Extraction Tools</title>
</titleInfo>
<name type="personal">
<namePart type="given">Adrien</namePart>
<namePart type="family">Barbaresi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gaël</namePart>
<namePart type="family">Lejeune</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<language>
<languageTerm type="text">English</languageTerm>
<languageTerm type="code" authority="iso639-2b">eng</languageTerm>
</language>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 12th Web as Corpus Workshop</title>
</titleInfo>
<originInfo>
<publisher>European Language Resources Association</publisher>
<place>
<placeTerm type="text">Marseille, France</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-10-95546-68-9</identifier>
</relatedItem>
<abstract>This article examines extraction methods designed to retain the main text content of web pages and discusses how the extraction could be oriented and evaluated: can and should it be as generic as possible to ensure opportunistic corpus construction? The evaluation grounds on a comparative benchmark of open-source tools used on pages in five different languages (Chinese, English, Greek, Polish and Russian), it features several metrics to obtain more fine-grained differentiations. Our experiments highlight the diversity of web page layouts across languages or publishing countries. These discrepancies are reflected by diverging performances so that the right tool has to be chosen accordingly.</abstract>
<identifier type="citekey">barbaresi-lejeune-2020-box</identifier>
<location>
<url>https://aclanthology.org/2020.wac-1.2</url>
</location>
<part>
<date>2020-05</date>
<extent unit="page">
<start>5</start>
<end>13</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Out-of-the-Box and into the Ditch? Multilingual Evaluation of Generic Text Extraction Tools
%A Barbaresi, Adrien
%A Lejeune, Gaël
%S Proceedings of the 12th Web as Corpus Workshop
%D 2020
%8 May
%I European Language Resources Association
%C Marseille, France
%@ 979-10-95546-68-9
%G English
%F barbaresi-lejeune-2020-box
%X This article examines extraction methods designed to retain the main text content of web pages and discusses how the extraction could be oriented and evaluated: can and should it be as generic as possible to ensure opportunistic corpus construction? The evaluation grounds on a comparative benchmark of open-source tools used on pages in five different languages (Chinese, English, Greek, Polish and Russian), it features several metrics to obtain more fine-grained differentiations. Our experiments highlight the diversity of web page layouts across languages or publishing countries. These discrepancies are reflected by diverging performances so that the right tool has to be chosen accordingly.
%U https://aclanthology.org/2020.wac-1.2
%P 5-13
Markdown (Informal)
[Out-of-the-Box and into the Ditch? Multilingual Evaluation of Generic Text Extraction Tools](https://aclanthology.org/2020.wac-1.2) (Barbaresi & Lejeune, WAC 2020)
ACL