@inproceedings{occhipinti-2024-complex,
title = "Complex Word Identification for {I}talian Language: A Dictionary{--}based Approach",
author = "Occhipinti, Laura",
booktitle = "Proceedings of the Sixth International Conference on Computational Linguistics in Bulgaria (CLIB 2024)",
month = sep,
year = "2024",
address = "Sofia, Bulgaria",
publisher = "Department of Computational Linguistics, Institute for Bulgarian Language, Bulgarian Academy of Sciences",
url = "https://aclanthology.org/2024.clib-1.12/",
pages = "119--129",
abstract = "Assessing word complexity in Italian poses significant challenges, particularly due to the absence of a standardized dataset. This study introduces the first automatic model designed to identify word complexity for native Italian speakers. A dictionary of simple and complex words was constructed, and various configurations of linguistic features were explored to find the best statistical classifier based on Random Forest algorithm. Considering the probabilities of a word to belong to a class, a comparison between the models' predictions and human assessments derived from a dataset annotated for complexity perception was made. Finally, the degree of accord between the model predictions and the human inter-annotator agreement was analyzed using Spearman correlation. Our findings indicate that a model incorporating both linguistic features and word embeddings performed better than other simpler models, also showing a value of correlation with the human judgements similar to the inter-annotator agreement. This study demonstrates the feasibility of an automatic system for detecting complexity in the Italian language with good performances and comparable effectiveness to humans in this subjective task."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="occhipinti-2024-complex">
<titleInfo>
<title>Complex Word Identification for Italian Language: A Dictionary–based Approach</title>
</titleInfo>
<name type="personal">
<namePart type="given">Laura</namePart>
<namePart type="family">Occhipinti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-09</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Sixth International Conference on Computational Linguistics in Bulgaria (CLIB 2024)</title>
</titleInfo>
<originInfo>
<publisher>Department of Computational Linguistics, Institute for Bulgarian Language, Bulgarian Academy of Sciences</publisher>
<place>
<placeTerm type="text">Sofia, Bulgaria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Assessing word complexity in Italian poses significant challenges, particularly due to the absence of a standardized dataset. This study introduces the first automatic model designed to identify word complexity for native Italian speakers. A dictionary of simple and complex words was constructed, and various configurations of linguistic features were explored to find the best statistical classifier based on Random Forest algorithm. Considering the probabilities of a word to belong to a class, a comparison between the models’ predictions and human assessments derived from a dataset annotated for complexity perception was made. Finally, the degree of accord between the model predictions and the human inter-annotator agreement was analyzed using Spearman correlation. Our findings indicate that a model incorporating both linguistic features and word embeddings performed better than other simpler models, also showing a value of correlation with the human judgements similar to the inter-annotator agreement. This study demonstrates the feasibility of an automatic system for detecting complexity in the Italian language with good performances and comparable effectiveness to humans in this subjective task.</abstract>
<identifier type="citekey">occhipinti-2024-complex</identifier>
<location>
<url>https://aclanthology.org/2024.clib-1.12/</url>
</location>
<part>
<date>2024-09</date>
<extent unit="page">
<start>119</start>
<end>129</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Complex Word Identification for Italian Language: A Dictionary–based Approach
%A Occhipinti, Laura
%S Proceedings of the Sixth International Conference on Computational Linguistics in Bulgaria (CLIB 2024)
%D 2024
%8 September
%I Department of Computational Linguistics, Institute for Bulgarian Language, Bulgarian Academy of Sciences
%C Sofia, Bulgaria
%F occhipinti-2024-complex
%X Assessing word complexity in Italian poses significant challenges, particularly due to the absence of a standardized dataset. This study introduces the first automatic model designed to identify word complexity for native Italian speakers. A dictionary of simple and complex words was constructed, and various configurations of linguistic features were explored to find the best statistical classifier based on Random Forest algorithm. Considering the probabilities of a word to belong to a class, a comparison between the models’ predictions and human assessments derived from a dataset annotated for complexity perception was made. Finally, the degree of accord between the model predictions and the human inter-annotator agreement was analyzed using Spearman correlation. Our findings indicate that a model incorporating both linguistic features and word embeddings performed better than other simpler models, also showing a value of correlation with the human judgements similar to the inter-annotator agreement. This study demonstrates the feasibility of an automatic system for detecting complexity in the Italian language with good performances and comparable effectiveness to humans in this subjective task.
%U https://aclanthology.org/2024.clib-1.12/
%P 119-129
Markdown (Informal)
[Complex Word Identification for Italian Language: A Dictionary–based Approach](https://aclanthology.org/2024.clib-1.12/) (Occhipinti, CLIB 2024)
ACL