{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "name": "Wassa2021.ipynb", "provenance": [], "collapsed_sections": [] }, "kernelspec": { "name": "python3", "display_name": "Python 3" } }, "cells": [ { "cell_type": "markdown", "metadata": { "id": "l5u7sgCmPNMg" }, "source": [ "

WASSA 2021 sumbissions notebook

\n", "\n", "This notebook contains the code used to generate the sumbission to the WASSA 2021 shared task for the prediction of empathy, distress and emotions in reaction from news stories.\n", "\n", "The paper that describes the submission is:\n", "\n", "G. Vettigli and A. Sorgente, “Empna at WASSA2021: A lightweight model for the prediction ofempathy,distress and emotions from reactions to news stories ”, in Proceedings of the Eleventh Workshop onComputational Approaches to Subjectivity, Sentiment and Social Media Analysis, Association forComputational Linguistics, 2021.\n", "\n", "Warning: the code the extract the features is not particularly efficient or well structured." ] }, { "cell_type": "markdown", "metadata": { "id": "im4VRjKgO575" }, "source": [ "

Data collection and environment setup

" ] }, { "cell_type": "code", "metadata": { "id": "UVNRUTMsHIMb" }, "source": [ "!wget https://drive.google.com/u/0/uc?id=1kkDi-3RxFnEpapXL4QXJK1hjzfy0Qdwc&export=download\n", "!unzip 'uc?id=1kkDi-3RxFnEpapXL4QXJK1hjzfy0Qdwc'\n", "\n", "!wget https://drive.google.com/u/0/uc?id=1cMFdOExy_MrXyOTAZdgm7U9PtOh509bL&export=download\n", "!unzip uc?id=1cMFdOExy_MrXyOTAZdgm7U9PtOh509bL\n", "!mkdir dev_features_labels_WASSA2021\n", "!mv messages_dev_features_ready_for_WS.tsv dev_features_labels_WASSA2021\n", "!mv messages_dev_sentencized_automatic_emotion_tags.tsv dev_features_labels_WASSA2021\n", "!mv goldstandard.tsv dev_features_labels_WASSA2021\n", "\n", "!wget https://drive.google.com/u/0/uc?id=1riRRo1e0_6jwD9v2C6yXwOip6mi9YbcL&export=download\n", "!unzip uc?id=1riRRo1e0_6jwD9v2C6yXwOip6mi9YbcL\n", "!mkdir test_features_labels_WASSA2021\n", "!mv messages_test_features_ready_for_WS.tsv test_features_labels_WASSA2021\n", "!mv messages_test_sentencized_automatic_emotion_tags.tsv test_features_labels_WASSA2021\n", "!mv gold_standard_test_EMO.tsv test_features_labels_WASSA2021\n", "\n", "!wget http://www.cs.uic.edu/~liub/FBS/opinion-lexicon-English.rar\n", "!unrar e opinion-lexicon-English.rar\n", "!wget https://raw.githubusercontent.com/buechel/socialsent/master/socialsent/data/lexicons/inquirer.json\n", "!wget https://raw.githubusercontent.com/buechel/socialsent/master/socialsent/data/lexicons/concreteness.json\n", "!wget https://raw.githubusercontent.com/buechel/socialsent/master/socialsent/data/lexicons/finance.json\n", "!wget https://raw.githubusercontent.com/buechel/socialsent/master/socialsent/data/lexicons/kuperman.json\n", "!wget https://raw.githubusercontent.com/buechel/socialsent/master/socialsent/data/lexicons/140-scores.json\n", "!wget https://raw.githubusercontent.com/buechel/socialsent/master/socialsent/data/lexicons/mpqa.json\n", "!wget https://raw.githubusercontent.com/buechel/socialsent/master/socialsent/data/lexicons/qwn-scores.json\n", "!wget https://raw.githubusercontent.com/buechel/socialsent/master/socialsent/data/lexicons/qwn.json\n", "!wget https://raw.githubusercontent.com/buechel/socialsent/master/socialsent/data/lexicons/qwn2.json\n", "!wget https://raw.githubusercontent.com/buechel/socialsent/master/socialsent/data/lexicons/twitter-scores.json\n", "!wget https://raw.githubusercontent.com/buechel/socialsent/master/socialsent/data/lexicons/twitter.json\n", "\n", "!pip install afinn\n", "\n", "!wget http://wwbp.org/downloads/public_data/empathy.zip\n", "!wget http://wwbp.org/downloads/public_data/distress.zip\n", "!unzip empathy.zip\n", "!unzip distress.zip\n", "\n", "!pip install xgboost\n", "!pip install senticnet\n", "import nltk\n", "nltk.download('punkt')\n", "nltk.download('wordnet')" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "PJBlbE9PPDjB" }, "source": [ "

Data import

" ] }, { "cell_type": "code", "metadata": { "id": "pYW6QngKHeD8" }, "source": [ "import pandas as pd\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "%matplotlib inline" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "nU-DWlabHlOv" }, "source": [ "train_data = pd.read_csv('train_features_labels_WASSA2021/messages_train_ready_for_WS.tsv', sep='\\t')\n", "dev_data = pd.read_csv('dev_features_labels_WASSA2021/messages_dev_features_ready_for_WS.tsv', sep='\\t')\n", "test_data = pd.read_csv('test_features_labels_WASSA2021/messages_test_features_ready_for_WS.tsv', sep='\\t')\n", "targets_dev = pd.read_csv('dev_features_labels_WASSA2021/goldstandard.tsv', sep='\\t', names=['empathy','distress','emotion'])\n", "dev_data['empathy'] = targets_dev.empathy.values\n", "dev_data['distress'] = targets_dev.empathy.values\n", "dev_data['emotion'] = targets_dev.emotion.values\n", "#train_data = pd.concat([train_data, dev_data], axis=1)" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "n2U6rg_rnbfe" }, "source": [ "train_data.shape, dev_data.shape, test_data.shape" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "nlbSRPh_7kEj" }, "source": [ "

Add lexicon-based features

" ] }, { "cell_type": "code", "metadata": { "id": "6jv7eyZb2xvo" }, "source": [ "from senticnet.senticnet import SenticNet\n", "sn = SenticNet()\n", "\n", "def sentic_net_moodtags(text):\n", " def get_polaritiy_values(word):\n", " try:\n", " concept_info = sn.concept(word)\n", " except:\n", " concept_info = None\n", " \n", " return concept_info\n", "\n", " tokens = text.split()\n", " moodtags = []\n", " for tok in tokens:\n", " sn_out = get_polaritiy_values(tok)\n", " if sn_out:\n", " moodtags += sn_out['moodtags']\n", " return ' '.join(moodtags)\n", "\n", "train_data['moodtags'] = train_data.essay.apply(sentic_net_moodtags)" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "ZTxWz4-p7Ifd" }, "source": [ "from afinn import Afinn\n", "afinn = Afinn()\n", "\n", "def sentic_net_features(text, ptype, aggregator=np.mean):\n", " def get_polaritiy_values(word):\n", " try:\n", " concept_info = sn.concept(word)\n", " except:\n", " concept_info = None\n", " \n", " return concept_info\n", "\n", " tokens = text.split()\n", " polaryties = [get_polaritiy_values(tok) for tok in tokens]\n", "\n", " if ptype == \"polarity_value\": \n", " polarity_value = aggregator([float(p['polarity_value']) for p in polaryties if p])\n", " return polarity_value\n", " if ptype == \"attention\":\n", " attention = aggregator([float(p['sentics']['attention']) for p in polaryties if p])\n", " return attention\n", " if ptype == \"temper\":\n", " temper = aggregator([float(p['sentics']['temper']) for p in polaryties if p])\n", " return temper\n", " if ptype == \"attitude\":\n", " temper = aggregator([float(p['sentics']['attitude']) for p in polaryties if p])\n", " return temper\n", " if ptype == \"pleasantness\":\n", " #import pdb; pdb.set_trace()\n", " pleasantness = aggregator([float(p['sentics']['pleasantness']) for p in polaryties if p])\n", " return pleasantness\n", " if ptype == \"sensitivity\":\n", " sensitivity = aggregator([float(p['sentics']['sensitivity']) for p in polaryties if p])\n", " return sensitivity\n", " \n", " return None\n", "\n", "def emptymax(x):\n", " if len(x) == 0:\n", " return 0\n", " return np.max(x)\n", "\n", "def emptymin(x):\n", " if len(x) == 0:\n", " return 0\n", " return np.min(x)\n", "\n", "def read_liu_lexicon(fname='positive-words.txt'):\n", " lexicon = {}\n", " with open(fname, encoding=\"ISO-8859-1\") as fpos:\n", " for line in fpos.readlines():\n", " if line != '' and not line.startswith(';'):\n", " lexicon[line.strip()] = True\n", " return lexicon\n", "\n", "bl_positive_words = read_liu_lexicon(fname='positive-words.txt')\n", "bl_negative_words = read_liu_lexicon(fname='negative-words.txt')\n", "\n", "import json\n", "inquirer = json.load(open('inquirer.json', 'r'))\n", "finance = json.load(open('finance.json', 'r'))\n", "concreteness = json.load(open('concreteness.json', 'r'))\n", "kuperman = json.load(open('kuperman.json', 'r'))\n", "scores140 = json.load(open('140-scores.json', 'r'))\n", "mpqa = json.load(open('mpqa.json', 'r'))\n", "qwn_scores = json.load(open('qwn-scores.json', 'r'))\n", "qwn = json.load(open('qwn.json', 'r'))\n", "qwn2 = json.load(open('qwn2.json', 'r'))\n", "twitter_scores = json.load(open('twitter-scores.json', 'r'))\n", "twitter = json.load(open('twitter.json', 'r'))\n", "\n", "\n", "distress_lexicon = pd.read_csv('distress_lexicon.txt').set_index('word').to_dict()['rating']\n", "empathy_lexicon = pd.read_csv('empathy_lexicon.txt').set_index('word').to_dict()['rating']\n", "\n", "import string\n", "\n", "\n", "def add_features(train):\n", " train['afinn'] = train.essay.apply(afinn.score)\n", " train['positive_words'] = train.essay.apply(lambda essay: np.sum([w in essay for w in bl_positive_words.keys()]))\n", " train['negative_words'] = train.essay.apply(lambda essay: np.sum([w in essay for w in bl_negative_words.keys()]))\n", "\n", " strip_punct = lambda essay : ''.join([c for c in essay if not c in string.punctuation])\n", " train_data['essay_nopunct'] = train_data.essay.apply(strip_punct)\n", " train_data['finance'] = train_data['essay_nopunct'].apply(lambda essay: np.nansum([finance.get(w, np.nan) for w in essay.split()]))\n", " train_data['concreteness'] = train_data['essay_nopunct'].apply(lambda essay: np.nansum([concreteness.get(w, np.nan) for w in essay.split()]))\n", " train_data['inquirer'] = train_data['essay_nopunct'].apply(lambda essay: np.nansum([inquirer.get(w, np.nan) for w in essay.split()]))\n", " train_data['kuperman'] = train_data['essay_nopunct'].apply(lambda essay: np.nansum([kuperman.get(w, np.nan) for w in essay.split()]))\n", "\n", " train_data['scores140'] = train_data['essay_nopunct'].apply(lambda essay: np.nansum([scores140.get(w, np.nan) for w in essay.split()]))\n", " train_data['mpqa'] = train_data['essay_nopunct'].apply(lambda essay: np.nansum([mpqa.get(w, np.nan) for w in essay.split()]))\n", " train_data['qwn_scores'] = train_data['essay_nopunct'].apply(lambda essay: np.nansum([qwn_scores.get(w, np.nan) for w in essay.split()]))\n", " train_data['qwn'] = train_data['essay_nopunct'].apply(lambda essay: np.nansum([qwn.get(w, np.nan) for w in essay.split()]))\n", " train_data['qwn2'] = train_data['essay_nopunct'].apply(lambda essay: np.nansum([qwn2.get(w, np.nan) for w in essay.split()]))\n", " train_data['twitter_scores'] = train_data['essay_nopunct'].apply(lambda essay: np.nansum([twitter_scores.get(w, np.nan) for w in essay.split()]))\n", " train_data['twitter'] = train_data['essay_nopunct'].apply(lambda essay: np.nansum([twitter.get(w, np.nan) for w in essay.split()]))\n", "\n", " train_data['kuperman'] = train_data['essay_nopunct'].apply(lambda essay: np.nansum([kuperman.get(w, np.nan) for w in essay.split()]))\n", "\n", " #train_data['empathy_lexicon'] = train_data['essay_nopunct'].apply(lambda essay: np.nansum([empathy_lexicon.get(w, np.nan) for w in essay.split()]))\n", " #train_data['distress_lexicon'] = train_data['essay_nopunct'].apply(lambda essay: np.nansum([distress_lexicon.get(w, np.nan) for w in essay.split()]))\n", "\n", " train['polarity_mean'] = train.essay.apply(sentic_net_features, \n", " ptype='polarity_value')\n", " train['polarity_max'] = train.essay.apply(sentic_net_features,\n", " ptype='polarity_value',\n", " aggregator=emptymax)\n", " train['polarity_min'] = train.essay.apply(sentic_net_features,\n", " ptype='polarity_value',\n", " aggregator=emptymin)\n", "\n", " train['temper_mean'] = train.essay.apply(sentic_net_features, \n", " ptype='temper')\n", " train['temper_max'] = train.essay.apply(sentic_net_features,\n", " ptype='temper',\n", " aggregator=emptymax)\n", " train['temper_min'] = train.essay.apply(sentic_net_features,\n", " ptype='temper',\n", " aggregator=emptymin)\n", "\n", " train['sensitivity_mean'] = train.essay.apply(sentic_net_features, \n", " ptype='sensitivity')\n", " train['sensitivity_max'] = train.essay.apply(sentic_net_features,\n", " ptype='sensitivity',\n", " aggregator=emptymax)\n", " train['sensitivity_min'] = train.essay.apply(sentic_net_features,\n", " ptype='sensitivity',\n", " aggregator=emptymin)\n", " \n", " train['attitude_mean'] = train.essay.apply(sentic_net_features, \n", " ptype='attitude')\n", " train['attitude_max'] = train.essay.apply(sentic_net_features,\n", " ptype='attitude',\n", " aggregator=emptymax)\n", " train['attitude_min'] = train.essay.apply(sentic_net_features,\n", " ptype='attitude',\n", " aggregator=emptymin)\n", "\n", "add_features(train_data)\n", "add_features(dev_data)\n", "add_features(test_data)\n", "train_data.head()" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "B674lrHjHrEN" }, "source": [ "numerical_features = ['gender', 'education', 'race', \n", " 'age', 'income', 'personality_conscientiousness',\n", " 'personality_openess', 'personality_extraversion',\n", " 'personality_agreeableness', 'personality_stability',\n", " 'iri_perspective_taking', 'iri_personal_distress', 'iri_fantasy',\n", " 'iri_empathatic_concern', \n", " ]\n", "\n", "for c in numerical_features:\n", " train_data[c] = pd.to_numeric(train_data[c])\n", "train_data.empathy = pd.to_numeric(train_data.empathy)" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "p30E-m3_7u2i" }, "source": [ "

Model for empathy

" ] }, { "cell_type": "code", "metadata": { "id": "USyyjjmrEFIl" }, "source": [ "from sklearn.pipeline import Pipeline, FeatureUnion\n", "from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer\n", "from sklearn.model_selection import cross_val_score\n", "from sklearn.model_selection import KFold\n", "from sklearn.preprocessing import FunctionTransformer, StandardScaler, MinMaxScaler\n", "from sklearn.linear_model import LinearRegression, Ridge\n", "from nltk import word_tokenize \n", "from nltk.stem import WordNetLemmatizer \n", "from nltk.stem.snowball import PorterStemmer, SnowballStemmer\n", "from sklearn.neighbors import KNeighborsRegressor\n", "from sklearn.metrics import make_scorer\n", "from scipy.stats import pearsonr\n", "\n", "def pearson_fun(y_true, y_pred):\n", " return pearsonr(y_true, y_pred)[0]\n", "\n", "pearson_score = make_scorer(pearson_fun)\n", "\n", "\n", "class LemmaTokenizerPorter(object):\n", " def __init__(self):\n", " self.wnl = PorterStemmer()\n", " def __call__(self, articles):\n", " return [self.wnl.stem(t) for t in word_tokenize(articles)]\n", "\n", "class LemmaTokenizerSnowBall(object):\n", " def __init__(self):\n", " self.wnl = SnowballStemmer('english')\n", " def __call__(self, articles):\n", " return [self.wnl.stem(t) for t in word_tokenize(articles)]\n", "\n", "class LemmaTokenizerWN(object):\n", " def __init__(self):\n", " self.wnl = WordNetLemmatizer()\n", " def __call__(self, articles):\n", " return [self.wnl.lemmatize(t) for t in word_tokenize(articles)]\n", "\n", "kfold = KFold(n_splits=5, shuffle=True)" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "OTTkG4rVH_Au" }, "source": [ "sentence_pipe = Pipeline([('sentence', FunctionTransformer(lambda x: x.essay,)),\n", " ('vect', CountVectorizer(#stop_words='english',\n", " max_df=0.8, \n", " tokenizer=LemmaTokenizerPorter(),\n", " preprocessor=lambda x: x.lower(),\n", " lowercase=True,\n", " #max_features=1500,\n", " ngram_range=(1, 3))),\n", " ('tfidf', TfidfTransformer(sublinear_tf=True,)),\n", " #('feature_selection', SelectKBest(f_regression, k=50))\n", " ])\n", "\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "moodtags_pipe = Pipeline([('moodtags', FunctionTransformer(lambda x: x.moodtags)),\n", " ('vectorizer', CountVectorizer(stop_words=None))])\n", "\n", "emotion_pipe = Pipeline([('emotion', FunctionTransformer(lambda x: x.emotion)),\n", " ('vectorizer', CountVectorizer(stop_words=None))])\n", "\n", "numerical_pipe = Pipeline([('numerical', FunctionTransformer(lambda x: x[numerical_features].values)),\n", " ('scale', StandardScaler())])\n", "\n", "s_numerical = ['polarity_mean', 'polarity_max', 'polarity_min',\n", " 'temper_mean', 'temper_max', 'temper_min',\n", " 'sensitivity_mean', 'sensitivity_max', 'sensitivity_min',\n", " 'attitude_mean', 'attitude_max', 'attitude_min',\n", " 'positive_words', 'negative_words', 'afinn',\n", " 'finance', 'concreteness', 'inquirer', 'kuperman',\n", " 'scores140', 'mpqa', 'qwn_scores', 'qwn',\n", " 'qwn2', 'twitter_scores', 'twitter',\n", " 'empathy_lexicon', 'distress_lexicon'\n", " ]\n", "\n", "senticnet_pipe = Pipeline([('s_numerical', FunctionTransformer(lambda x: x[numerical_features].values)),\n", " ('scale', StandardScaler())])\n", "\n", "combined_features = FeatureUnion([('sentence', sentence_pipe), \n", " ('numerical', numerical_pipe),\n", " ('senticpipe', senticnet_pipe)\n", " ])\n", "\n", "reg = LinearRegression()\n", "\n", "emp_empathy = Pipeline([\n", " ('vect', combined_features),\n", " ('predictor', reg)\n", "])\n", "\n", "\n", "scores = cross_val_score(emp_empathy, train_data, train_data.empathy, \n", " scoring=pearson_score,\n", " cv=kfold, verbose=1, n_jobs=-1)\n", "print('x-valid :', -scores.mean(), scores.std())\n", "\n", "emp_empathy.fit(train_data, train_data.empathy)\n", "print('held out : ', pearson_fun(emp_empathy.predict(dev_data), dev_data.empathy))" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "IMD_5PvGYHNA" }, "source": [ "scores = cross_val_score(emp_empathy, train_data, train_data.empathy, \n", " scoring='neg_mean_squared_error', \n", " cv=kfold, verbose=1, n_jobs=-1)\n", "print('x-valid :', -scores.mean(), scores.std())\n", "\n", "scores = cross_val_score(emp_empathy, train_data, train_data.empathy, \n", " scoring='neg_mean_absolute_error', \n", " cv=kfold, verbose=1, n_jobs=-1)\n", "print('x-valid :', -scores.mean(), scores.std())\n", "\n", "scores = cross_val_score(emp_empathy, train_data, train_data.empathy, \n", " scoring='r2', \n", " cv=kfold, verbose=1, n_jobs=-1)\n", "print('x-valid :', -scores.mean(), scores.std())" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "355qJcrhgryd" }, "source": [ "from sklearn.model_selection import GridSearchCV\n", "\n", "param_grid = {\n", " 'vect__sentence__vect__max_df': [.7, .8, .9],\n", " 'vect__sentence__vect__max_features': [10, 500, 1000, None],\n", " 'vect__sentence__vect__tokenizer': [LemmaTokenizerSnowBall(), \n", " LemmaTokenizerPorter(),\n", " LemmaTokenizerWN(),\n", " None],\n", " 'vect__sentence__vect__ngram_range': [(1, 2), (1, 3)],\n", "}\n", "\n", "search = GridSearchCV(emp_empathy, param_grid, n_jobs=-1,\n", " #scoring='neg_root_mean_squared_error',\n", " scoring=pearson_score,\n", " cv=kfold,\n", " verbose=2)\n", "\n", "search.fit(train_data, train_data.empathy)\n", "print(\"Best parameter (CV score=%0.3f):\" % search.best_score_)\n", "print(search.best_params_)\n", "print(search.best_estimator_)" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "UceLe6rT70t1" }, "source": [ "

Model for distress

" ] }, { "cell_type": "code", "metadata": { "id": "izTynW1cEXL-" }, "source": [ "from sklearn.feature_selection import SelectKBest, f_regression\n", "\n", "sentence_pipe = Pipeline([('sentence', FunctionTransformer(lambda x: x.essay,)),\n", " ('vect', CountVectorizer(#stop_words='english',\n", " max_df=0.7,\n", " tokenizer=LemmaTokenizerPorter(),\n", " #preprocessor=lambda x: x.lower(),\n", " lowercase=True,\n", " #max_features=10,\n", " ngram_range=(1, 3))),\n", " ('tfidf', TfidfTransformer(sublinear_tf=True,)),\n", " #('feature_selection', SelectKBest(f_regression, k=100))\n", " ])\n", "\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "moodtags_pipe = Pipeline([('moodtags', FunctionTransformer(lambda x: x.moodtags)),\n", " ('vectorizer', TfidfVectorizer(stop_words=None))])\n", "\n", "numerical_pipe = Pipeline([('numerical', FunctionTransformer(lambda x: x[numerical_features].values)),\n", " ('scale', StandardScaler())])\n", "\n", "combined_features = FeatureUnion([('sentence', sentence_pipe),\n", " ('numerical', numerical_pipe),\n", " ('senticpipe', senticnet_pipe),\n", " ])\n", "\n", "\n", "reg = LinearRegression()\n", "\n", "emp_distress = Pipeline([\n", " ('vect', combined_features),\n", " ('predictor', reg)\n", "])\n", "\n", "scores = cross_val_score(emp_distress, train_data, train_data.distress, \n", " scoring=pearson_score,\n", " cv=kfold, verbose=1, n_jobs=-1)\n", "print('x-valid :', -scores.mean(), scores.std())\n", "\n", "emp_distress.fit(train_data, train_data.distress)\n", "print('held out : ', pearson_fun(emp_distress.predict(dev_data), dev_data.distress))\n", "\n", "scores = cross_val_score(emp_empathy, train_data, train_data.empathy, \n", " scoring='neg_mean_squared_error', \n", " cv=kfold, verbose=1, n_jobs=-1)\n", "print('x-valid :', -scores.mean(), scores.std())\n", "\n", "scores = cross_val_score(emp_empathy, train_data, train_data.empathy, \n", " scoring='neg_mean_absolute_error', \n", " cv=kfold, verbose=1, n_jobs=-1)\n", "print('x-valid :', -scores.mean(), scores.std())\n", "\n", "scores = cross_val_score(emp_empathy, train_data, train_data.empathy, \n", " scoring='r2', \n", " cv=kfold, verbose=1, n_jobs=-1)\n", "print('x-valid :', -scores.mean(), scores.std())" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "t6r68Cl8JBcO" }, "source": [ "from sklearn.model_selection import GridSearchCV\n", "\n", "param_grid = {\n", " 'vect__sentence__vect__max_df': [.7, .8, .9],\n", " 'vect__sentence__vect__max_features': [10, 500, 1000, None],\n", " 'vect__sentence__vect__tokenizer': [LemmaTokenizerSnowBall(), \n", " LemmaTokenizerPorter(),\n", " LemmaTokenizerWN(),\n", " None],\n", " 'vect__sentence__vect__ngram_range': [(1, 2), (1, 3)],\n", "}\n", "\n", "search = GridSearchCV(emp_distress, param_grid, n_jobs=-1,\n", " #scoring='neg_root_mean_squared_error',\n", " scoring=pearson_score,\n", " cv=kfold,\n", " verbose=2)\n", "\n", "search.fit(train_data, train_data.distress)\n", "print(\"Best parameter (CV score=%0.3f):\" % search.best_score_)\n", "print(search.best_params_)\n", "print(search.best_estimator_)" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "308Z9uu_4ZRa" }, "source": [ "submission_df = pd.DataFrame()\n", "submission_df['empathy'] = emp_empathy.predict(test_data)\n", "submission_df['distress'] = emp_distress.predict(test_data)\n", "submission_df.to_csv('predictions_EMP.tsv', index=False, header=False, sep='\\t')\n", "!zip submission_emp_only.zip predictions_EMP.tsv" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "S5OAfzTE77_l" }, "source": [ "

Model for emotions

" ] }, { "cell_type": "code", "metadata": { "id": "NvloQo3CJSSl" }, "source": [ "from sklearn.neighbors import KNeighborsClassifier\n", "from sklearn.preprocessing import LabelEncoder\n", "from sklearn.tree import DecisionTreeClassifier\n", "from sklearn.ensemble import RandomForestClassifier\n", "from sklearn.neighbors import KNeighborsClassifier\n", "from sklearn.discriminant_analysis import LinearDiscriminantAnalysis\n", "\n", "lenc = LabelEncoder()\n", "train_data['emotion_enc'] = lenc.fit_transform(train_data.emotion)\n", "\n", "from sklearn.linear_model import LogisticRegression\n", "clf = LogisticRegression(class_weight='balanced', multi_class='ovr',\n", " solver='liblinear', C=0.9)\n", "\n", "sentence_pipe = Pipeline([('sentence', FunctionTransformer(lambda x: x.essay,)),\n", " ('vect', CountVectorizer(#stop_words='english',\n", " max_df=0.9,\n", " tokenizer=LemmaTokenizerPorter(),\n", " #preprocessor=lambda x: x.lower(),\n", " lowercase=True,\n", " max_features=1000,\n", " ngram_range=(1, 2))),\n", " ('tfidf', TfidfTransformer(sublinear_tf=True,)),\n", " #('feature_selection', SelectKBest(f_regression, k=100))\n", " ])\n", "\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "moodtags_pipe = Pipeline([('moodtags', FunctionTransformer(lambda x: x.moodtags)),\n", " ('vectorizer', TfidfVectorizer(stop_words=None))])\n", "\n", "numerical_pipe = Pipeline([('numerical', FunctionTransformer(lambda x: x[numerical_features].values)),\n", " ('scale', StandardScaler())])\n", "\n", "combined_features = FeatureUnion([('sentence', sentence_pipe), \n", " #('moodtags', moodtags_pipe), \n", " ('numerical', numerical_pipe),\n", " ('senticpipe', senticnet_pipe),\n", " ])\n", "\n", "mpo_clf = Pipeline([\n", " ('vect', combined_features),\n", " ('todense', FunctionTransformer(lambda x: x.todense())),\n", " ('predictor', clf)\n", "])\n", "\n", "kfold = KFold(n_splits=5, shuffle=True)\n", "scores = cross_val_score(mpo_clf, train_data, train_data.emotion_enc, \n", " scoring='f1_macro', cv=kfold, verbose=1)\n", "\n", "print('xvalid : ', scores.mean(), scores.std())\n", "from sklearn.metrics import f1_score\n", "mpo_clf.fit(train_data, train_data.emotion_enc)\n", "print('held out:', f1_score(lenc.transform(dev_data.emotion),\n", " mpo_clf.predict(dev_data), average='micro'))" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "6Xjl31bQw5AA" }, "source": [ "[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n", "[Parallel(n_jobs=1)]: Done 5 out of 5 | elapsed: 23.2s finished\n", "xvalid : 0.4290322580645161 0.017960530202677488\n", "held out: 0.4370370370370371" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "cyxuzRnTZvcX" }, "source": [ "from sklearn.metrics import classification_report, confusion_matrix\n", "print(classification_report(lenc.transform(dev_data.emotion),\n", " mpo_clf.predict(dev_data),\n", " target_names=lenc.classes_.tolist()))" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "0O1ufrbRKed2" }, "source": [ "plt.pcolor(mpo_clf.named_steps['predictor'].coef_, cmap='Blues')" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "kMxlwmIeGSNA" }, "source": [ "colors = {'sadness': 'C0', 'neutral': 'C1', 'fear': 'C2', 'anger': 'C3',\n", " 'disgust': 'C4', 'surprise': 'C5', 'joy': 'C6'}\n", "\n", "pd.plotting.scatter_matrix(train_data[['positive_words',\n", " 'negative_words', 'finance', 'concreteness', 'inquirer',\n", " 'kuperman', 'scores140', 'mpqa',\n", " 'qwn_scores', 'qwn', 'qwn2',\n", " 'twitter_scores', 'twitter']],\n", " figsize=(25, 25),\n", " color=[colors[e] for e in train_data.emotion]\n", ");" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "yTB8VQixeOeG" }, "source": [ "from sklearn.model_selection import GridSearchCV\n", "\n", "param_grid = {\n", " 'vect__sentence__vect__max_df': [.9],\n", " 'vect__sentence__vect__max_features': [500],\n", " 'vect__sentence__vect__tokenizer': [LemmaTokenizerWN(),\n", " None],\n", " 'vect__sentence__vect__ngram_range': [(1, 2)]\n", " \n", "}\n", "\n", "search = GridSearchCV(mpo_clf, param_grid, n_jobs=-1,\n", " scoring='f1_micro',\n", " cv=kfold,\n", " verbose=2)\n", "\n", "search.fit(train_data, train_data.emotion_enc)\n", "print(\"Best parameter (CV score=%0.3f):\" % search.best_score_)\n", "print(search.best_params_)\n", "print(search.best_estimator_)" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "m_dqeEUfPQg0" }, "source": [ "submission_df = pd.DataFrame()\n", "submission_df['emotion'] = lenc.inverse_transform(mpo_clf.predict(test_data))\n", "submission_df.to_csv('predictions_EMO.tsv', index=False, header=False, sep='\\t')\n", "!zip submission_both.zip predictions_EMP.tsv predictions_EMO.tsv" ], "execution_count": null, "outputs": [] } ] }