@inproceedings{grouin-2016-text,
title = "Text Segmentation of Digitized Clinical Texts",
author = "Grouin, Cyril",
editor = "Calzolari, Nicoletta and
Choukri, Khalid and
Declerck, Thierry and
Goggi, Sara and
Grobelnik, Marko and
Maegaard, Bente and
Mariani, Joseph and
Mazo, Helene and
Moreno, Asuncion and
Odijk, Jan and
Piperidis, Stelios",
booktitle = "Proceedings of the Tenth International Conference on Language Resources and Evaluation ({LREC}'16)",
month = may,
year = "2016",
address = "Portoro{\v{z}}, Slovenia",
publisher = "European Language Resources Association (ELRA)",
url = "https://aclanthology.org/L16-1570",
pages = "3592--3599",
abstract = "In this paper, we present the experiments we made to recover the original page layout structure into two columns from layout damaged digitized files. We designed several CRF-based approaches, either to identify column separator or to classify each token from each line into left or right columns. We achieved our best results with a model trained on homogeneous corpora (only files composed of 2 columns) when classifying each token into left or right columns (overall F-measure of 0.968). Our experiments show it is possible to recover the original layout in columns of digitized documents with results of quality.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="grouin-2016-text">
<titleInfo>
<title>Text Segmentation of Digitized Clinical Texts</title>
</titleInfo>
<name type="personal">
<namePart type="given">Cyril</namePart>
<namePart type="family">Grouin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2016-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC’16)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Khalid</namePart>
<namePart type="family">Choukri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Thierry</namePart>
<namePart type="family">Declerck</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sara</namePart>
<namePart type="family">Goggi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marko</namePart>
<namePart type="family">Grobelnik</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bente</namePart>
<namePart type="family">Maegaard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joseph</namePart>
<namePart type="family">Mariani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Helene</namePart>
<namePart type="family">Mazo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Asuncion</namePart>
<namePart type="family">Moreno</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jan</namePart>
<namePart type="family">Odijk</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stelios</namePart>
<namePart type="family">Piperidis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>European Language Resources Association (ELRA)</publisher>
<place>
<placeTerm type="text">Portorož, Slovenia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>In this paper, we present the experiments we made to recover the original page layout structure into two columns from layout damaged digitized files. We designed several CRF-based approaches, either to identify column separator or to classify each token from each line into left or right columns. We achieved our best results with a model trained on homogeneous corpora (only files composed of 2 columns) when classifying each token into left or right columns (overall F-measure of 0.968). Our experiments show it is possible to recover the original layout in columns of digitized documents with results of quality.</abstract>
<identifier type="citekey">grouin-2016-text</identifier>
<location>
<url>https://aclanthology.org/L16-1570</url>
</location>
<part>
<date>2016-05</date>
<extent unit="page">
<start>3592</start>
<end>3599</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Text Segmentation of Digitized Clinical Texts
%A Grouin, Cyril
%Y Calzolari, Nicoletta
%Y Choukri, Khalid
%Y Declerck, Thierry
%Y Goggi, Sara
%Y Grobelnik, Marko
%Y Maegaard, Bente
%Y Mariani, Joseph
%Y Mazo, Helene
%Y Moreno, Asuncion
%Y Odijk, Jan
%Y Piperidis, Stelios
%S Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC’16)
%D 2016
%8 May
%I European Language Resources Association (ELRA)
%C Portorož, Slovenia
%F grouin-2016-text
%X In this paper, we present the experiments we made to recover the original page layout structure into two columns from layout damaged digitized files. We designed several CRF-based approaches, either to identify column separator or to classify each token from each line into left or right columns. We achieved our best results with a model trained on homogeneous corpora (only files composed of 2 columns) when classifying each token into left or right columns (overall F-measure of 0.968). Our experiments show it is possible to recover the original layout in columns of digitized documents with results of quality.
%U https://aclanthology.org/L16-1570
%P 3592-3599
Markdown (Informal)
[Text Segmentation of Digitized Clinical Texts](https://aclanthology.org/L16-1570) (Grouin, LREC 2016)
ACL
- Cyril Grouin. 2016. Text Segmentation of Digitized Clinical Texts. In Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC'16), pages 3592–3599, Portorož, Slovenia. European Language Resources Association (ELRA).