% Paper in the 3rd Joint SIGHUM Workshop proceedings (ACL Anthology W19-2502).
@inproceedings{riedl-etal-2019-clustering,
  title     = {Clustering-Based Article Identification in Historical Newspapers},
  author    = {Riedl, Martin and
               Betz, Daniela and
               Pad{\'o}, Sebastian},
  editor    = {Alex, Beatrice and
               Degaetano-Ortlieb, Stefania and
               Kazantseva, Anna and
               Reiter, Nils and
               Szpakowicz, Stan},
  booktitle = {Proceedings of the 3rd Joint {SIGHUM} Workshop on Computational Linguistics for Cultural Heritage, Social Sciences, Humanities and Literature},
  month     = jun,
  year      = {2019},
  address   = {Minneapolis, USA},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/W19-2502},
  doi       = {10.18653/v1/W19-2502},
  pages     = {12--17},
  abstract  = {This article focuses on the problem of identifying articles and recovering their text from within and across newspaper pages when OCR just delivers one text file per page. We frame the task as a segmentation plus clustering step. Our results on a sample of 1912 New York Tribune magazine shows that performing the clustering based on similarities computed with word embeddings outperforms a similarity measure based on character n-grams and words. Furthermore, the automatic segmentation based on the text results in low scores, due to the low quality of some OCRed documents.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 collection holding a single record, keyed by the same ID as its citekey identifier. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="riedl-etal-2019-clustering">
    <!-- Title of the paper itself. -->
    <titleInfo>
      <title>Clustering-Based Article Identification in Historical Newspapers</title>
    </titleInfo>
    <!-- Authors, one <name> element each, in citation order. -->
    <name type="personal">
      <namePart type="given">Martin</namePart>
      <namePart type="family">Riedl</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Daniela</namePart>
      <namePart type="family">Betz</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Sebastian</namePart>
      <namePart type="family">Padó</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2019-06</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, with its editors and publisher details. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 3rd Joint SIGHUM Workshop on Computational Linguistics for Cultural Heritage, Social Sciences, Humanities and Literature</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Beatrice</namePart>
        <namePart type="family">Alex</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Stefania</namePart>
        <namePart type="family">Degaetano-Ortlieb</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Anna</namePart>
        <namePart type="family">Kazantseva</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Nils</namePart>
        <namePart type="family">Reiter</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Stan</namePart>
        <namePart type="family">Szpakowicz</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Minneapolis, USA</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>This article focuses on the problem of identifying articles and recovering their text from within and across newspaper pages when OCR just delivers one text file per page. We frame the task as a segmentation plus clustering step. Our results on a sample of 1912 New York Tribune magazine shows that performing the clustering based on similarities computed with word embeddings outperforms a similarity measure based on character n-grams and words. Furthermore, the automatic segmentation based on the text results in low scores, due to the low quality of some OCRed documents.</abstract>
    <!-- Identifiers and access location for the paper. -->
    <identifier type="citekey">riedl-etal-2019-clustering</identifier>
    <identifier type="doi">10.18653/v1/W19-2502</identifier>
    <location>
      <url>https://aclanthology.org/W19-2502</url>
    </location>
    <!-- Issue date and page range for this part. -->
    <part>
      <date>2019-06</date>
      <extent unit="page">
        <start>12</start>
        <end>17</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Clustering-Based Article Identification in Historical Newspapers
%A Riedl, Martin
%A Betz, Daniela
%A Padó, Sebastian
%Y Alex, Beatrice
%Y Degaetano-Ortlieb, Stefania
%Y Kazantseva, Anna
%Y Reiter, Nils
%Y Szpakowicz, Stan
%S Proceedings of the 3rd Joint SIGHUM Workshop on Computational Linguistics for Cultural Heritage, Social Sciences, Humanities and Literature
%D 2019
%8 June
%I Association for Computational Linguistics
%C Minneapolis, USA
%F riedl-etal-2019-clustering
%X This article focuses on the problem of identifying articles and recovering their text from within and across newspaper pages when OCR just delivers one text file per page. We frame the task as a segmentation plus clustering step. Our results on a sample of 1912 New York Tribune magazine shows that performing the clustering based on similarities computed with word embeddings outperforms a similarity measure based on character n-grams and words. Furthermore, the automatic segmentation based on the text results in low scores, due to the low quality of some OCRed documents.
%R 10.18653/v1/W19-2502
%U https://aclanthology.org/W19-2502
%U https://doi.org/10.18653/v1/W19-2502
%P 12-17
Markdown (Informal)
[Clustering-Based Article Identification in Historical Newspapers](https://aclanthology.org/W19-2502) (Riedl et al., LaTeCH 2019)
ACL
- Martin Riedl, Daniela Betz, and Sebastian Padó. 2019. Clustering-Based Article Identification in Historical Newspapers. In Proceedings of the 3rd Joint SIGHUM Workshop on Computational Linguistics for Cultural Heritage, Social Sciences, Humanities and Literature, pages 12–17, Minneapolis, USA. Association for Computational Linguistics.