@inproceedings{zweigenbaum-etal-2016-supervised,
title = "Supervised classification of end-of-lines in clinical text with no manual annotation",
author = "Zweigenbaum, Pierre and
Grouin, Cyril and
Lavergne, Thomas",
editor = "Ananiadou, Sophia and
Batista-Navarro, Riza and
Cohen, Kevin Bretonnel and
Demner-Fushman, Dina and
Thompson, Paul",
booktitle = "Proceedings of the Fifth Workshop on Building and Evaluating Resources for Biomedical Text Mining ({B}io{T}xt{M}2016)",
month = dec,
year = "2016",
address = "Osaka, Japan",
publisher = "The COLING 2016 Organizing Committee",
url = "https://aclanthology.org/W16-5109",
pages = "80--88",
abstract = "In some plain text documents, end-of-line marks may or may not mark the boundary of a text unit (e.g., of a paragraph). This vexing problem is likely to impact subsequent natural language processing components, but is seldom addressed in the literature. We propose a method which uses no manual annotation to classify whether end-of-lines must actually be seen as simple spaces (soft line breaks) or as true text unit boundaries. This method, which includes self-training and co-training steps based on token and line length features, achieves 0.943 F-measure on a corpus of short e-books with controlled format, F=0.904 on a random sample of 24 clinical texts with soft line breaks, and F=0.898 on a larger set of mixed clinical texts which may or may not contain soft line breaks, a fairly high value for a method with no manual annotation.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="zweigenbaum-etal-2016-supervised">
<titleInfo>
<title>Supervised classification of end-of-lines in clinical text with no manual annotation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Pierre</namePart>
<namePart type="family">Zweigenbaum</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Cyril</namePart>
<namePart type="family">Grouin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Thomas</namePart>
<namePart type="family">Lavergne</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2016-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Fifth Workshop on Building and Evaluating Resources for Biomedical Text Mining (BioTxtM2016)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Sophia</namePart>
<namePart type="family">Ananiadou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Riza</namePart>
<namePart type="family">Batista-Navarro</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kevin</namePart>
<namePart type="given">Bretonnel</namePart>
<namePart type="family">Cohen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dina</namePart>
<namePart type="family">Demner-Fushman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Paul</namePart>
<namePart type="family">Thompson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>The COLING 2016 Organizing Committee</publisher>
<place>
<placeTerm type="text">Osaka, Japan</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>In some plain text documents, end-of-line marks may or may not mark the boundary of a text unit (e.g., of a paragraph). This vexing problem is likely to impact subsequent natural language processing components, but is seldom addressed in the literature. We propose a method which uses no manual annotation to classify whether end-of-lines must actually be seen as simple spaces (soft line breaks) or as true text unit boundaries. This method, which includes self-training and co-training steps based on token and line length features, achieves 0.943 F-measure on a corpus of short e-books with controlled format, F=0.904 on a random sample of 24 clinical texts with soft line breaks, and F=0.898 on a larger set of mixed clinical texts which may or may not contain soft line breaks, a fairly high value for a method with no manual annotation.</abstract>
<identifier type="citekey">zweigenbaum-etal-2016-supervised</identifier>
<location>
<url>https://aclanthology.org/W16-5109</url>
</location>
<part>
<date>2016-12</date>
<extent unit="page">
<start>80</start>
<end>88</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Supervised classification of end-of-lines in clinical text with no manual annotation
%A Zweigenbaum, Pierre
%A Grouin, Cyril
%A Lavergne, Thomas
%Y Ananiadou, Sophia
%Y Batista-Navarro, Riza
%Y Cohen, Kevin Bretonnel
%Y Demner-Fushman, Dina
%Y Thompson, Paul
%S Proceedings of the Fifth Workshop on Building and Evaluating Resources for Biomedical Text Mining (BioTxtM2016)
%D 2016
%8 December
%I The COLING 2016 Organizing Committee
%C Osaka, Japan
%F zweigenbaum-etal-2016-supervised
%X In some plain text documents, end-of-line marks may or may not mark the boundary of a text unit (e.g., of a paragraph). This vexing problem is likely to impact subsequent natural language processing components, but is seldom addressed in the literature. We propose a method which uses no manual annotation to classify whether end-of-lines must actually be seen as simple spaces (soft line breaks) or as true text unit boundaries. This method, which includes self-training and co-training steps based on token and line length features, achieves 0.943 F-measure on a corpus of short e-books with controlled format, F=0.904 on a random sample of 24 clinical texts with soft line breaks, and F=0.898 on a larger set of mixed clinical texts which may or may not contain soft line breaks, a fairly high value for a method with no manual annotation.
%U https://aclanthology.org/W16-5109
%P 80-88
Markdown (Informal)
[Supervised classification of end-of-lines in clinical text with no manual annotation](https://aclanthology.org/W16-5109) (Zweigenbaum et al., 2016)
ACL