% LREC 2016 conference paper presenting the Ubuntu-fr corpus (entry key L16-1280).
% NOTE(review): the third author's surname may be the compound "Loginova Clouet";
% the split below keeps the parse BibTeX derived from the original
% "Elizaveta Loginova Clouet" (last name = Clouet). Confirm before changing.
@inproceedings{L16-1280,
 abstract = {We present a large, free, French corpus of online written conversations extracted from the Ubuntu platform's forums, mailing lists and IRC channels. The corpus is meant to support multi-modality and diachronic studies of online written conversations. We choose to build the corpus around a robust metadata model based upon strong principles, such as the "stand off" annotation principle. We detail the model, we explain how the data was collected and processed - in terms of meta-data, text and conversation - and we detail the corpus' contents through a series of meaningful statistics. A portion of the corpus - about 4,700 sentences from emails, forum posts and chat messages sent in November 2014 - is annotated in terms of dialogue acts and sentiment. We discuss how we adapted our dialogue act taxonomy from the DIT++ annotation scheme and how the data was annotated, before presenting our results as well as a brief qualitative analysis of the annotated data.},
 address = {Portoro{\v{z}}, Slovenia},
 author = {Hernandez, Nicolas and Salim, Soufian and Clouet, Elizaveta Loginova},
 booktitle = {Proceedings of the Tenth International Conference on Language Resources and Evaluation ({LREC} 2016)},
 month = may,
 pages = {1777--1783},
 publisher = {European Language Resources Association (ELRA)},
 title = {{Ubuntu-fr}: A Large and Open Corpus for Multi-modal Analysis of Online Written Conversations},
 url = {https://www.aclweb.org/anthology/L16-1280},
 year = {2016}
}

