% Hazem and Morin, COLING 2016 technical paper on bilingual terminology
% extraction from comparable corpora. The month uses the standard BibTeX
% macro so the bibliography style controls its rendering.
@InProceedings{hazem-morin:2016:COLING,
  author    = {Hazem, Amir and Morin, Emmanuel},
  title     = {Efficient Data Selection for Bilingual Terminology Extraction from Comparable Corpora},
  booktitle = {Proceedings of {COLING} 2016, the 26th International Conference on Computational Linguistics: Technical Papers},
  month     = dec,
  year      = {2016},
  address   = {Osaka, Japan},
  publisher = {The COLING 2016 Organizing Committee},
  pages     = {3401--3411},
  abstract  = {Comparable corpora are the main alternative to the use of parallel corpora to
	extract bilingual lexicons. Although it is easier to build comparable corpora,
	specialized comparable corpora are often of modest size in comparison with
	corpora issued from the general domain. Consequently, the observations of word
	co-occurrences which are the basis of context-based methods are unreliable. We
	propose in this article to improve word co-occurrences of specialized
	comparable corpora and thus context representation by using general-domain
	data. This idea, which has been already used in machine translation task for
	more than a decade, is not straightforward for the task of bilingual lexicon
	extraction from specific-domain comparable corpora. We go against the
	mainstream of this task where many studies support the idea that adding
	out-of-domain documents decreases the quality of lexicons. Our empirical
	evaluation shows the advantages of this approach which induces a significant
	gain in the accuracy of extracted lexicons.},
  url       = {http://aclweb.org/anthology/C16-1321}
}