% LREC 2016 paper by Cyril Grouin (ACL Anthology ID L16-1570): CRF-based
% recovery of the two-column page layout of digitized clinical texts.
% Text outside an entry is ignored by BibTeX, so these lines act as a comment.
@inproceedings{L16-1570,
 abstract = {In this paper, we present the experiments we made to recover the original page layout structure into two columns from layout damaged digitized files. We designed several CRF-based approaches, either to identify column separator or to classify each token from each line into left or right columns. We achieved our best results with a model trained on homogeneous corpora (only files composed of 2 columns) when classifying each token into left or right columns (overall F-measure of 0.968). Our experiments show it is possible to recover the original layout in columns of digitized documents with results of quality.},
 address = {Portorož, Slovenia},
 author = {Grouin, Cyril},
 booktitle = {Proceedings of the Tenth International Conference on Language Resources and Evaluation ({LREC} 2016)},
 month = may,
 pages = {3592--3599},
 publisher = {European Language Resources Association (ELRA)},
 title = {Text Segmentation of Digitized Clinical Texts},
 url = {https://www.aclweb.org/anthology/L16-1570},
 year = {2016}
}

