% LREC 2016 paper on corpus-based diacritic restoration for Croatian, Serbian and Slovene.
@inproceedings{L16-1573,
 abstract = {In computer-mediated communication, Latin-based scripts users often omit diacritics when writing. Such text is typically easily understandable to humans but very difficult for computational processing because many words become ambiguous or unknown. Letter-level approaches to diacritic restoration generalise better and do not require a lot of training data but word-level approaches tend to yield better results. However, they typically rely on a lexicon which is an expensive resource, not covering non-standard forms, and often not available for less-resourced languages. In this paper we present diacritic restoration models that are trained on easy-to-acquire corpora. We test three different types of corpora (Wikipedia, general web, Twitter) for three South Slavic languages (Croatian, Serbian and Slovene) and evaluate them on two types of text: standard (Wikipedia) and non-standard (Twitter). The proposed approach considerably outperforms charlifter, so far the only open source tool available for this task. We make the best performing systems freely available.},
 address = {Portorož, Slovenia},
 author = {Ljubešić, Nikola and Erjavec, Tomaž and Fišer, Darja},
 booktitle = {Proceedings of the Tenth International Conference on Language Resources and Evaluation ({LREC} 2016)},
 month = may,
 pages = {3612--3616},
 publisher = {European Language Resources Association (ELRA)},
 title = {Corpus-Based Diacritic Restoration for {South} {Slavic} Languages},
 url = {https://www.aclweb.org/anthology/L16-1573},
 year = {2016}
}

