% Workshop paper (BioTxtM 2016, published by the COLING 2016 Organizing Committee).
% Conventions used in this entry: accented letters are written as BibTeX
% special characters ({\'e}) so sorting and labels work under classic BibTeX;
% case-sensitive words in titles are brace-protected; month uses the
% standard unquoted macro so the bibliography style controls its rendering.
@InProceedings{dhondt-EtAl:2016:BioTxtM2016,
  author    = {D'hondt, Eva and Grouin, Cyril and N{\'e}v{\'e}ol, Aur{\'e}lie and Stamatatos, Efstathios and Zweigenbaum, Pierre},
  title     = {Detection of Text Reuse in {French} Medical Corpora},
  booktitle = {Proceedings of the Fifth Workshop on Building and Evaluating Resources for Biomedical Text Mining ({BioTxtM2016})},
  month     = dec,
  year      = {2016},
  address   = {Osaka, Japan},
  publisher = {The COLING 2016 Organizing Committee},
  pages     = {108--114},
  abstract  = {Electronic Health Records (EHRs) are increasingly available in modern health
    care institutions either through the direct creation of electronic documents in
    hospitals' health information systems, or through the digitization of
    historical paper records. Each EHR creation method yields the need for
    sophisticated text reuse detection tools in order to prepare the EHR
    collections for efficient secondary use relying on Natural Language Processing
    methods. Herein, we address the detection of two types of text reuse in French
    EHRs: 1) the detection of updated versions of the same document and 2) the
    detection of document duplicates that still bear surface differences due to OCR
    or de-identification processing. We present a robust text reuse detection
    method to automatically identify redundant document pairs in two French EHR
    corpora that achieves an overall macro F-measure of 0.68 and 0.60, respectively
    and correctly identifies all redundant document pairs of interest.},
  url       = {http://aclweb.org/anthology/W16-5112}
}

