@inproceedings{popovic-2018-complex,
title = "Complex Word Identification Using Character n-grams",
author = "Popovi{\'c}, Maja",
editor = "Tetreault, Joel and
Burstein, Jill and
Kochmar, Ekaterina and
Leacock, Claudia and
Yannakoudakis, Helen",
booktitle = "Proceedings of the Thirteenth Workshop on Innovative Use of {NLP} for Building Educational Applications",
month = jun,
year = "2018",
address = "New Orleans, Louisiana",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/W18-0541",
doi = "10.18653/v1/W18-0541",
pages = "341--348",
abstract = "This paper investigates the use of character n-gram frequencies for identifying complex words in English, German and Spanish texts. The approach is based on the assumption that complex words are likely to contain different character sequences than simple words. The multinomial Naive Bayes classifier was used with n-grams of different lengths as features, and the best results were obtained for the combination of 2-grams and 4-grams. This variant was submitted to the Complex Word Identification Shared Task 2018 for all texts and achieved F-scores between 70{\%} and 83{\%}. The system was ranked in the middle range for all English texts, as third of fourteen submissions for German, and as tenth of seventeen submissions for Spanish. The method is not very convenient for the cross-language task, achieving only 59{\%} on the French text.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="popovic-2018-complex">
<titleInfo>
<title>Complex Word Identification Using Character n-grams</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maja</namePart>
<namePart type="family">Popović</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2018-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Thirteenth Workshop on Innovative Use of NLP for Building Educational Applications</title>
</titleInfo>
<name type="personal">
<namePart type="given">Joel</namePart>
<namePart type="family">Tetreault</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jill</namePart>
<namePart type="family">Burstein</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Kochmar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Claudia</namePart>
<namePart type="family">Leacock</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Helen</namePart>
<namePart type="family">Yannakoudakis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">New Orleans, Louisiana</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>This paper investigates the use of character n-gram frequencies for identifying complex words in English, German and Spanish texts. The approach is based on the assumption that complex words are likely to contain different character sequences than simple words. The multinomial Naive Bayes classifier was used with n-grams of different lengths as features, and the best results were obtained for the combination of 2-grams and 4-grams. This variant was submitted to the Complex Word Identification Shared Task 2018 for all texts and achieved F-scores between 70% and 83%. The system was ranked in the middle range for all English texts, as third of fourteen submissions for German, and as tenth of seventeen submissions for Spanish. The method is not very convenient for the cross-language task, achieving only 59% on the French text.</abstract>
<identifier type="citekey">popovic-2018-complex</identifier>
<identifier type="doi">10.18653/v1/W18-0541</identifier>
<location>
<url>https://aclanthology.org/W18-0541</url>
</location>
<part>
<date>2018-06</date>
<extent unit="page">
<start>341</start>
<end>348</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Complex Word Identification Using Character n-grams
%A Popović, Maja
%Y Tetreault, Joel
%Y Burstein, Jill
%Y Kochmar, Ekaterina
%Y Leacock, Claudia
%Y Yannakoudakis, Helen
%S Proceedings of the Thirteenth Workshop on Innovative Use of NLP for Building Educational Applications
%D 2018
%8 June
%I Association for Computational Linguistics
%C New Orleans, Louisiana
%F popovic-2018-complex
%X This paper investigates the use of character n-gram frequencies for identifying complex words in English, German and Spanish texts. The approach is based on the assumption that complex words are likely to contain different character sequences than simple words. The multinomial Naive Bayes classifier was used with n-grams of different lengths as features, and the best results were obtained for the combination of 2-grams and 4-grams. This variant was submitted to the Complex Word Identification Shared Task 2018 for all texts and achieved F-scores between 70% and 83%. The system was ranked in the middle range for all English texts, as third of fourteen submissions for German, and as tenth of seventeen submissions for Spanish. The method is not very convenient for the cross-language task, achieving only 59% on the French text.
%R 10.18653/v1/W18-0541
%U https://aclanthology.org/W18-0541
%U https://doi.org/10.18653/v1/W18-0541
%P 341-348
Markdown (Informal)
[Complex Word Identification Using Character n-grams](https://aclanthology.org/W18-0541) (Popović, BEA 2018)
ACL
- Maja Popović. 2018. Complex Word Identification Using Character n-grams. In Proceedings of the Thirteenth Workshop on Innovative Use of NLP for Building Educational Applications, pages 341–348, New Orleans, Louisiana. Association for Computational Linguistics.