@inproceedings{kazmi-etal-2022-linguistically,
title = "Linguistically Motivated Features for Classifying Shorter Text into Fiction and Non-Fiction Genre",
author = "Kazmi, Arman and
Ranjan, Sidharth and
Sharma, Arpit and
Rajkumar, Rajakrishnan",
editor = "Calzolari, Nicoletta and
Huang, Chu-Ren and
Kim, Hansaem and
Pustejovsky, James and
Wanner, Leo and
Choi, Key-Sun and
Ryu, Pum-Mo and
Chen, Hsin-Hsi and
Donatelli, Lucia and
Ji, Heng and
Kurohashi, Sadao and
Paggio, Patrizia and
Xue, Nianwen and
Kim, Seokhwan and
Hahm, Younggyun and
He, Zhong and
Lee, Tony Kyungil and
Santus, Enrico and
Bond, Francis and
Na, Seung-Hoon",
booktitle = "Proceedings of the 29th International Conference on Computational Linguistics",
month = oct,
year = "2022",
address = "Gyeongju, Republic of Korea",
publisher = "International Committee on Computational Linguistics",
url = "https://aclanthology.org/2022.coling-1.77/",
pages = "922--937",
abstract = "This work deploys linguistically motivated features to classify paragraph-level text into fiction and non-fiction genre using a logistic regression model and infers lexical and syntactic properties that distinguish the two genres. Previous works have focused on classifying document-level text into fiction and non-fiction genres, while in this work, we deal with shorter texts which are closer to real-world applications like sentiment analysis of tweets. Going beyond simple POS tag ratios proposed in Qureshi et al.(2019) for document-level classification, we extracted multiple linguistically motivated features belonging to four categories: Lexical features, POS ratio features, Syntactic features and Raw features. For the task of short-text classification, a model containing 28 best-features (selected via Recursive feature elimination with cross-validation; RFECV) confers an accuracy jump of 15.56 {\%} over a baseline model consisting of 2 POS-ratio features found effective in previous work (cited above). The efficacy of the above model containing a linguistically motivated feature set also transfers over to another dataset viz, Baby BNC corpus. We also compared the classification accuracy of the logistic regression model with two deep-learning models. A 1D CNN model gives an increase of 2{\%} accuracy over the logistic Regression classifier on both corpora. And the BERT-base-uncased model gives the best classification accuracy of 97{\%} on Brown corpus and 98{\%} on Baby BNC corpus. Although both the deep learning models give better results in terms of classification accuracy, the problem of interpreting these models remains unsolved. In contrast, regression model coefficients revealed that fiction texts tend to have more character-level diversity and have lower lexical density (quantified using content-function word ratios) compared to non-fiction texts. Moreover, subtle differences in word order exist between the two genres, i.e., in fiction texts Verbs precede Adverbs (inter-alia)."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="kazmi-etal-2022-linguistically">
<titleInfo>
<title>Linguistically Motivated Features for Classifying Shorter Text into Fiction and Non-Fiction Genre</title>
</titleInfo>
<name type="personal">
<namePart type="given">Arman</namePart>
<namePart type="family">Kazmi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sidharth</namePart>
<namePart type="family">Ranjan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Arpit</namePart>
<namePart type="family">Sharma</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rajakrishnan</namePart>
<namePart type="family">Rajkumar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-10</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 29th International Conference on Computational Linguistics</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chu-Ren</namePart>
<namePart type="family">Huang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hansaem</namePart>
<namePart type="family">Kim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">James</namePart>
<namePart type="family">Pustejovsky</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Leo</namePart>
<namePart type="family">Wanner</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Key-Sun</namePart>
<namePart type="family">Choi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pum-Mo</namePart>
<namePart type="family">Ryu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hsin-Hsi</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lucia</namePart>
<namePart type="family">Donatelli</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Heng</namePart>
<namePart type="family">Ji</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sadao</namePart>
<namePart type="family">Kurohashi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Patrizia</namePart>
<namePart type="family">Paggio</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nianwen</namePart>
<namePart type="family">Xue</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Seokhwan</namePart>
<namePart type="family">Kim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Younggyun</namePart>
<namePart type="family">Hahm</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhong</namePart>
<namePart type="family">He</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tony</namePart>
<namePart type="given">Kyungil</namePart>
<namePart type="family">Lee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Enrico</namePart>
<namePart type="family">Santus</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Francis</namePart>
<namePart type="family">Bond</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Seung-Hoon</namePart>
<namePart type="family">Na</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>International Committee on Computational Linguistics</publisher>
<place>
<placeTerm type="text">Gyeongju, Republic of Korea</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>This work deploys linguistically motivated features to classify paragraph-level text into fiction and non-fiction genre using a logistic regression model and infers lexical and syntactic properties that distinguish the two genres. Previous works have focused on classifying document-level text into fiction and non-fiction genres, while in this work, we deal with shorter texts which are closer to real-world applications like sentiment analysis of tweets. Going beyond simple POS tag ratios proposed in Qureshi et al.(2019) for document-level classification, we extracted multiple linguistically motivated features belonging to four categories: Lexical features, POS ratio features, Syntactic features and Raw features. For the task of short-text classification, a model containing 28 best-features (selected via Recursive feature elimination with cross-validation; RFECV) confers an accuracy jump of 15.56 % over a baseline model consisting of 2 POS-ratio features found effective in previous work (cited above). The efficacy of the above model containing a linguistically motivated feature set also transfers over to another dataset viz, Baby BNC corpus. We also compared the classification accuracy of the logistic regression model with two deep-learning models. A 1D CNN model gives an increase of 2% accuracy over the logistic Regression classifier on both corpora. And the BERT-base-uncased model gives the best classification accuracy of 97% on Brown corpus and 98% on Baby BNC corpus. Although both the deep learning models give better results in terms of classification accuracy, the problem of interpreting these models remains unsolved. In contrast, regression model coefficients revealed that fiction texts tend to have more character-level diversity and have lower lexical density (quantified using content-function word ratios) compared to non-fiction texts. Moreover, subtle differences in word order exist between the two genres, i.e., in fiction texts Verbs precede Adverbs (inter-alia).</abstract>
<identifier type="citekey">kazmi-etal-2022-linguistically</identifier>
<location>
<url>https://aclanthology.org/2022.coling-1.77/</url>
</location>
<part>
<date>2022-10</date>
<extent unit="page">
<start>922</start>
<end>937</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Linguistically Motivated Features for Classifying Shorter Text into Fiction and Non-Fiction Genre
%A Kazmi, Arman
%A Ranjan, Sidharth
%A Sharma, Arpit
%A Rajkumar, Rajakrishnan
%Y Calzolari, Nicoletta
%Y Huang, Chu-Ren
%Y Kim, Hansaem
%Y Pustejovsky, James
%Y Wanner, Leo
%Y Choi, Key-Sun
%Y Ryu, Pum-Mo
%Y Chen, Hsin-Hsi
%Y Donatelli, Lucia
%Y Ji, Heng
%Y Kurohashi, Sadao
%Y Paggio, Patrizia
%Y Xue, Nianwen
%Y Kim, Seokhwan
%Y Hahm, Younggyun
%Y He, Zhong
%Y Lee, Tony Kyungil
%Y Santus, Enrico
%Y Bond, Francis
%Y Na, Seung-Hoon
%S Proceedings of the 29th International Conference on Computational Linguistics
%D 2022
%8 October
%I International Committee on Computational Linguistics
%C Gyeongju, Republic of Korea
%F kazmi-etal-2022-linguistically
%X This work deploys linguistically motivated features to classify paragraph-level text into fiction and non-fiction genre using a logistic regression model and infers lexical and syntactic properties that distinguish the two genres. Previous works have focused on classifying document-level text into fiction and non-fiction genres, while in this work, we deal with shorter texts which are closer to real-world applications like sentiment analysis of tweets. Going beyond simple POS tag ratios proposed in Qureshi et al.(2019) for document-level classification, we extracted multiple linguistically motivated features belonging to four categories: Lexical features, POS ratio features, Syntactic features and Raw features. For the task of short-text classification, a model containing 28 best-features (selected via Recursive feature elimination with cross-validation; RFECV) confers an accuracy jump of 15.56 % over a baseline model consisting of 2 POS-ratio features found effective in previous work (cited above). The efficacy of the above model containing a linguistically motivated feature set also transfers over to another dataset viz, Baby BNC corpus. We also compared the classification accuracy of the logistic regression model with two deep-learning models. A 1D CNN model gives an increase of 2% accuracy over the logistic Regression classifier on both corpora. And the BERT-base-uncased model gives the best classification accuracy of 97% on Brown corpus and 98% on Baby BNC corpus. Although both the deep learning models give better results in terms of classification accuracy, the problem of interpreting these models remains unsolved. In contrast, regression model coefficients revealed that fiction texts tend to have more character-level diversity and have lower lexical density (quantified using content-function word ratios) compared to non-fiction texts. Moreover, subtle differences in word order exist between the two genres, i.e., in fiction texts Verbs precede Adverbs (inter-alia).
%U https://aclanthology.org/2022.coling-1.77/
%P 922-937
Markdown (Informal)
[Linguistically Motivated Features for Classifying Shorter Text into Fiction and Non-Fiction Genre](https://aclanthology.org/2022.coling-1.77/) (Kazmi et al., COLING 2022)
ACL