# -*- coding: utf-8 -*- """ Reproduces the analyses for 2019 NAACL submission titled "What just happened? Evaluating retrofitted distributional vectors" Dmetri Hayes 2019-04-04 @author: dmetri """ import os import gzip import math import numpy as np import sys from scipy.spatial.distance import cosine from scipy.stats import spearmanr, pearsonr, hmean from sklearn.metrics import mean_squared_error as mse import re import io import pandas as pd from copy import deepcopy import seaborn as sns import matplotlib.pyplot as plt from gensim.models import KeyedVectors SYS_VERSION = sys.version_info[0] print('Using Python %d' % SYS_VERSION) # set the open function if SYS_VERSION == 3: open_file = open elif SYS_VERSION == 2: open_file = io.open ROOT = os.getcwd() # global paths VEC_ROOT = ROOT + '/vectors/' #VEC_ROOT = ROOT + '/../../data/word_vectors/' PATH_LEX = ROOT + '/lexicons/' PATH_SIM = ROOT + '/similarity/' PATH_OUT_FIG = ROOT + '/output/' # path VEC_PATH = os.path.join(os.getcwd(), VEC_ROOT) # vector keys VEC_KEYS = ['SG', 'GloVe', 'NB', 'GC', 'Multi'] SG_KEY = 'SG_FULL' # current hack to allow this code to be stand-alone #SG_KEY = 'SG' # XXX: uncomment this to use the original word2vec vectors # word2vec/skipGram constants SG_PATH_FULL = VEC_PATH + 'GoogleNews-vectors-negative300.bin' SG_PATH_EVAL = VEC_PATH + 'sg_eval.txt' SG_CHAR_MAP = {'-':''} SG_WORD_MAP = {'harbour':'harbor', 'colour':'color', 'grey':'gray', 'theatre':'theater'} SG_LOWER = False # GloVe constants GLOVE_PATH_FULL = VEC_PATH + 'glove.6B.300d.txt' GLOVE_PATH_EVAL = VEC_PATH + 'glove_eval.txt' GLOVE_CHAR_MAP = {} GLOVE_WORD_MAP = {} GLOVE_LOWER = True # NumberBatch constants NB_PATH_FULL = VEC_PATH + 'numberbatch-en-17.06.txt.gz' NB_PATH_EVAL = VEC_PATH + 'nb_eval.txt' NB_CHAR_MAP = {'-':'_', ' ':'_'} NB_WORD_MAP = {} NB_LOWER = True # GC constants GC_PATH_FULL = VEC_PATH + 'globalContext.txt' GC_PATH_EVAL = VEC_PATH + 'gc_eval.txt' GC_CHAR_MAP = {} GC_WORD_MAP = {} GC_LOWER = True # Multi constants MULTI_PATH_FULL = VEC_PATH + 'de-projected-en-512.txt.gz' MULTI_PATH_EVAL = VEC_PATH + 'multi_eval.txt' MULTI_CHAR_MAP = {} MULTI_WORD_MAP = {} MULTI_LOWER = True #### DO NOT ALTER BELOW WORD2VEC_KEY = SG_KEY # alias for SG_KEY # combine the full paths PATH_FULL_DICT = {'NB': NB_PATH_FULL, 'GloVe': GLOVE_PATH_FULL, 'SG': SG_PATH_FULL, 'GC': GC_PATH_FULL, 'Multi': MULTI_PATH_FULL} # combine the eval paths PATH_EVAL_DICT = {'NB': NB_PATH_EVAL, 'GloVe': GLOVE_PATH_EVAL, 'SG': SG_PATH_EVAL, 'GC': GC_PATH_EVAL, 'Multi': MULTI_PATH_EVAL} # combine the character maps CHAR_MAP_DICT = {'NB': NB_CHAR_MAP, 'GloVe': GLOVE_CHAR_MAP, 'SG': SG_CHAR_MAP, 'GC': GC_CHAR_MAP, 'Multi': MULTI_CHAR_MAP} # combine the word maps WORD_MAP_DICT = {'NB': NB_WORD_MAP, 'GloVe': GLOVE_WORD_MAP, 'SG': SG_WORD_MAP, 'GC': GC_WORD_MAP, 'Multi': MULTI_WORD_MAP} # combine the lower flags VEC_LOWER_DICT = {'NB': NB_LOWER, 'GloVe': GLOVE_LOWER, 'SG': SG_LOWER, 'GC': GC_LOWER, 'Multi': MULTI_LOWER} # compile numbers IS_NUMBER = re.compile(r'\d+.*') SPECIAL_WORDS = ['---num---', '---punc---'] def get_word_from_label(label): ''' Retrieves the word from a label of the form "PREFIX@WORD". 
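    An illustrative example (the label shown here is hypothetical):
        >>> get_word_from_label('1@revenge')
        'revenge'
        >>> get_word_from_label('revenge')
        'revenge'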
Args: label (str): the label of the grouping Returns: the word, if the label is of the form PREFIX.WORD if no such word is found, the label is returned ''' items = re.findall('.*[@](.*)', label) if len(items) == 1: return items[0] elif len(items) > 1: raise ValueError('Labels must be of the form "PREFIX@WORD" or "WORD"') else: return label def read_lines(filename): ''' Reads in the lines of a file. Args: filename (str): name of the file to be loaded Returns: a list of strings, the lines of the text ''' with open_file(filename, 'r', encoding='utf8') as f: lines = f.read().strip().split('\n') return lines def processAnnoText(words, version): ''' Preprocesses the annotation text from FrameNet lexical units. Args: words (list of str): a list of words to be processed version (float): the version of FrameNet Returns: a list of strings, the words without parentheses, underscores, brackets or spaces ''' if version == 1.5: # remove parentheses post = [re.sub('\(.*\)','', w).strip() for w in words] # if an underscore is at the end of a word, just remove it post = [re.sub('_$','', p).strip() for p in post] # skip words which have underscores or spaces (i.e. multiword compound) post = [p for p in post if not any(x in p for x in ['_', ' '])] post = list(set(post)) elif version in [1.7, 1.71]: # remove parentheses post = [re.sub('\(.*\)','', w).strip() for w in words] # remove brackets post = [re.sub('\[.*\]','', p).strip() for p in post] # skip words which have spaces (i.e. multiword compound) post = [p.strip() for p in post if ' ' not in p] post = list(set(post)) else: raise ValueError('Version %s not supported' % version) return post def norm_word(word_or_label): ''' Normalizes a word or label by neutralizing punctuation and numbers, as well as lowering the word. Args: word_or_label (str): a preprocessed string Returns: a processed word ''' # check if the item is a label (as opposed to a regular word) items = re.findall('.*[@](.*)', word_or_label) if len(items) == 1: prefix = re.findall('(.*@).*', word_or_label) word = items[0] elif len(items) > 1: raise ValueError('Labels must be of the form "PREFIX@WORD" or "WORD"') else: prefix = '' word = word_or_label if IS_NUMBER.search(word.lower()): return '---num---' elif re.sub(r'\W+', '', word) == '': return '---punc---' else: # return the possibly recovered label return prefix + word.lower() def get_unique_vals(d, include_keys=True): ''' Returns the unique values from a dictionary mapping a string to a list of strings. Args: d (dict from str to list of str): the dictionary include_keys (bool, optional): if True, the keys of the dictionary are included in the values Returns: a list of str, the unique vals from the dictionary ''' temp = set() for v in d.values(): temp.update(v) if include_keys: temp.update(d.keys()) vals = list(temp) return vals ## VECTORS def read_word_vecs_legacy(filename): ''' Reads a set of word vectors and normalizes them. (the original code from Faruqui et al. 
2015) ''' wordVectors = {} if filename.endswith('.gz'): fileObject = gzip.open(filename, 'r') else: fileObject = open(filename, 'r') for line in fileObject: line = line.strip().lower() word = line.split()[0] wordVectors[word] = np.zeros(len(line.split())-1, dtype=float) for index, vecVal in enumerate(line.split()[1:]): wordVectors[word][index] = float(vecVal) ''' normalize weight vector ''' wordVectors[word] /= math.sqrt((wordVectors[word]**2).sum() + 1e-6) sys.stderr.write("Vectors read from: "+filename+" \n") return wordVectors def read_word_vecs(filename, doLower=False, normalize=False): ''' Reads a set of word vectors. Args: filename (str): the file name of the word vectors to be loaded normalize (bool, optional): if True, normalizes the word vectors Returns: a dictionary of strings to their Numpy arrays ''' wordVectors = {} if filename.endswith('.gz'): fileObject = gzip.open(filename, 'r') isBinary = True else: fileObject = open_file(filename, 'r', encoding='utf8') isBinary = False for line in fileObject: line = line.strip() # redid this so it doesn't lower the values if doLower: line = line.lower() items = line.split() if isBinary: # decode into ascii word = items[0].decode() else: word = items[0] if normalize: wordVectors[word] = np.zeros(len(line.split())-1, dtype=float) for index, vecVal in enumerate(line.split()[1:]): wordVectors[word][index] = float(vecVal) # normalize the weight vector wordVectors[word] /= math.sqrt((wordVectors[word]**2).sum() + 1e-6) else: wordVectors[word] = np.array([float(i) for i in items[1:]]) return wordVectors def get_word2vec(path, **kwargs): ''' A helper function which loads word2vec using the `gensim` module Args: path (str): path to the word vectors kwargs: Additional arguments that get passed to `load_word2vec_format` Returns: a `` ''' vecs = KeyedVectors.load_word2vec_format(path, **kwargs) return vecs def load_word_vecs(filenames, keys=None, legacy=False, lowerMap=None, normalize=False, word2vec_key=None, verbose=True): ''' Loads word vector files as a dictionary. Args: filenames (list of str): list of file names to load keys (list of str, optional): list of dictionary keys legacy (bool, optional): if True, uses the original retrofitting code lowerMap (dict from str to bool, optional): a dictionary mapping vector names to whether or not the words should be lowered before the evaluation normalize (bool, optional): if True, normalizes the word vectors word2vec_key (str, optional): key of the word2vec word vectors, which are loaded with `get_word2vec` verbose (bool, optional): if True, prints the names of the word vectors as they are loaded Returns: a dictionary mapping the name of each set of word vectors to those word vectors ''' if keys == None: keys = filenames vectors = {} for i in range(len(filenames)): f = filenames[i] if legacy: vecs = read_word_vecs_legacy(f+'.txt') else: if lowerMap is not None: doLower = lowerMap[keys[i]] else: doLower = False if keys[i] == word2vec_key: vecs = get_word2vec(f, binary=True) else: vecs = read_word_vecs(f, doLower=doLower, normalize=normalize) vectors[keys[i]] = vecs if verbose: print('%s loaded' % keys[i]) return vectors def read_lexicon_legacy(filename): ''' Reads word relations as a dictionary. (the original code from Faruqui et al. 
2015) ''' lexicon = {} for line in open(filename, 'r'): words = line.lower().strip().split() lexicon[norm_word(words[0])] = [norm_word(word) for word in words[1:]] return lexicon def read_lexicon(filename_or_lines, labeled=False, overwriteVals=False, doLabel=True): ''' Reads word relations as a dictionary. Args: filename_or_lines (str or list of str): if a string, the file path of the lines to be loaded. if a list, the lines defining the word relations labeled (bool, optional): specifies whether the word relations begin with a group label. By default the lines of the semantic resource are assumed to begin with a word, not a label. overwriteVals (bool, optional): if True, allows the entries of the dictionary to be overwritten. Only relevant if doLabel=False, otherwise overwriteVals is set to True doLabel (bool, optional): if True, labels the relations as PREFIX@WORD. Previous labels (i.e. REVENGE) are ignored. Returns: a dictionary mapping each word to its neighbors according to the semantic resource ''' if type(filename_or_lines) == str: lines = read_lines(filename_or_lines) elif type(filename_or_lines) == list: lines = filename_or_lines # if there were already group labels, get rid of them if labeled: lines = [' '.join(l.split()[1:]) for l in lines] # label lines? if doLabel: lines = label_lines(lines) if overwriteVals == False: print('WARNING: overwriteVals overwritten to True') overwriteVals = True lexicon = {} for line in lines: words = line.lower().strip().split() if len(words) > 0: # ensure compatibility with lines of the form PREFIX@WORD key = words[0] first_word = get_word_from_label(key) key_clean = norm_word(first_word) if not overwriteVals: if (key_clean in lexicon) & (key_clean not in SPECIAL_WORDS): msg = '"%s" already included in the dictionary' % key_clean raise KeyError(msg) lexicon[key] = [norm_word(word) for word in words[1:]] return lexicon def label_lines(filename_or_lines): ''' Returns labeled lines of a semantic resource of the form PREFIX@WORD. Args: filename_or_lines (str or list of str): if a string, the file path to the unlabeled semantic resource. if a list, the lines of the unlabeled semantic resource Returns: the lines of the semantic resource with the first word of each line replaced by PREFIX@WORD ''' if type(filename_or_lines) == str: lines = read_lines(filename_or_lines) elif type(filename_or_lines) == list: lines = filename_or_lines else: raise ValueError('filename_or_lines must be of type str or list') # keep count of each of the words count = {} new_lines = [] # loop through the lines for l in lines: words = l.split() if len(words) > 0: first_word = words[0] other_words = ' '.join(words[1:]) if first_word not in count: myCount = 1 count[first_word] = myCount # set count to 1 else: myCount = count[first_word] + 1 # increment count count[first_word] = myCount new = u'{}@{}'.format(myCount, first_word) + ' ' + other_words new_lines.append(new) return new_lines def read_compact_lexicon(filename_or_lines, sep='\t'): ''' Reads groupings of word relations as a dictionary. Args: filename_or_lines (str or list of str): if a string, the file path of the lines to be loaded. 
if a list, the lines defining the word relations Returns: a dictionary mapping each group label to its neighbors according to the semantic resource ''' if type(filename_or_lines) == str: lines = read_lines(filename_or_lines) elif type(filename_or_lines) == list: lines = filename_or_lines label_to_items = {l.split(sep)[0] : list(set(l.split(sep)[1:])) \ for l in lines} return label_to_items def load_lexicons(filenames, labeled=None, keys=None, legacy=False, overwriteVals=True, doLabel=True, verbose=True): ''' Loads the retrofitting semantic lexicons as a dictionary. Args: filenames (list of str): list of file names to load keys (list of str, optional): list of dictionary keys labeled (list of bool, optional): specifies for each semantic resource, whether the word relations begin with a group label. By default the lines of a semantic resource are assumed to begin with a word, not a label. legacy (bool, optional): if True, uses the original retrofitting code. This is a global option. overwriteVals (bool, optional): if True, allows the entries of the dictionary to be overwritten. Only relevant if doLabel=False, otherwise overwriteVals is to True doLabel (bool, optional): if True, labels the relations as PREFIX@WORD. Previous labels (i.e. REVENGE) are ignored. verbose (bool, optional): if True, prints the names of the lexicons as they are loaded Returns: a dictionary mapping the name of each semantic lexicon to that semantic lexicon ''' if keys == None: keys = filenames else: if len(keys) != len(filenames): raise ValueError('keys must be the same length as filenames') if labeled == None: labeled = np.full(len(filenames), False) else: if len(labeled) != len(filenames): raise ValueError('labeled must be the same length as filenames') lexicons = {} for i in range(len(filenames)): f = filenames[i] if legacy: lex = read_lexicon_legacy(f+'.txt') else: lex = read_lexicon(f+'.txt', labeled=labeled[i], overwriteVals=overwriteVals, doLabel=doLabel) lexicons[keys[i]] = lex if verbose: print('%s loaded' % keys[i]) return lexicons def load_compact_lexicons(filenames, keys=None, verbose=True, sep='\t'): ''' Loads multiple compact retrofitting lexicons as a dictionary. Args: filenames (list of str): list of file names to load keys (list of str, optional): list of dictionary keys verbose (bool, optional): if True, prints the names of the lexicons as they are loaded sep (str, optional): separator for the split operation Returns: a dictionary mapping the name of each semantic lexicon to that semantic lexicon ''' if keys == None: keys = filenames else: if len(keys) != len(filenames): raise ValueError('keys must be the same length as filenames') lexicons = {} for i in range(len(filenames)): f = filenames[i] lex = read_compact_lexicon(f + '.txt', sep=sep) lexicons[keys[i]] = lex if verbose: print('%s loaded' % keys[i]) return lexicons def get_sub_lexicon(lex, words): ''' Returns the subset of the semantic resource which contains the target words. Args: lex (dict from str to list of str): a dictionary mapping each word to its neighbors in a semantic resource words (array-like of str): the target words Returns: the subset of the semantic resource containing the target words ''' # create a smaller version of the semantic resource lex_sub = {k : v for k, v in list(lex.items()) \ if any((True for x in v if x in words))} return lex_sub def check_if_paired(lex, *words, grouped=None): ''' Checks whether a set of words are paired together within a semantic resource. 
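    Note that `*words` is a variadic positional parameter, so the words must be
    passed positionally, e.g. (with a hypothetical lexicon)
    check_if_paired(lexicons['FrameNet'], 'cat', 'dog'); passing `lex` by
    keyword before the words would be a syntax error.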
Args: lex (dict from str to list of str): a semantic resource grouped (bool, optional): indicates whether the two words are both in the resource *words: variable length list of words Returns: a boolean, whether or not the words are paired in the same group of words in the semantic resource Examples: >>> isPaired = check_if_paired(lex=lexicons['FrameNet'], 'cat', 'dog') ''' if grouped == False: return False # check if both words are in the resource isPaired = False for k, v in lex.items(): # combine the key with the values items = [get_word_from_label(k)] items.extend(v) # check to see if both words are present if all(x in items for x in words): isPaired = True return isPaired def check_if_grouped(lex, sims, lexName, words=None, getPaired=False): ''' Adds to an existing dataframe, indicating if words are grouped together in a particular semantic resource. Args: lex (dict): a dictionary mapping each word to its neighbors sims (`DataFrame`): a word similarity dataframe lexName (str): name of the semantic resource words (list, optional): list of unique words in the semantic resource. if not supplied, it is generated from the lexicon getPaired (bool, optional): checks whether the two words are paired in the same resource (warning: this may take a long time) Returns: a Pandas DataFrame derived from `sims` with columns indicating whether each pair of words in the semantic resource are present or grouped within the lexicon Examples: >>> df = check_if_grouped(fn, sims, lexName='FrameNet', words=words) ''' # copy the original similarity dataframe df = sims.copy() if words == None: words = get_unique_vals(lex) # check if both words are in the resource col_both = '{}_Both'.format(lexName) df[col_both] = df['Word1'].isin(words) & df['Word2'].isin(words) # restrict attention to the words which appeared in both df_both = df[df[col_both] == True] # get the sub-lexicon for the relevant words simWords = set(df_both['Word1']).union(df_both['Word2']) lex_sub = get_sub_lexicon(lex, simWords) if getPaired: # check if both words are paired in the resource col_paired = '{}_Paired'.format(lexName) paired = df_both.apply(lambda x: check_if_paired(lex_sub, x['Word1'], x['Word2'], grouped=x[col_both]), axis=1) # assign the new column df_both = df_both.assign(**{col_paired : paired}) # merge with the old dataframe df = pd.merge(df, df_both, how='left') # fill the resulting na values with False df = df.fillna(False) return df def check_if_grouped_many(lexicons, evalData, wordData=None, getPaired=False, verbose=True): ''' Adds to the existing dataframe objects in a dictionary, indicating if words are grouped together in a particular semantic resource. Args: lexicons (dict from str to dict): a dictionary mapping each lexicon name to a dictionary mapping each word to its neighbors evalData (dict from str to `DataFrame`): a dictionary mapping each word similarity dataset name to its dataframe wordData (dict, optional): a dictionary mapping each resource name to a list of unique words in the semantic resource. 
if not supplied, this information is generated from the lexicons getPaired (bool, optional): checks whether the two words are paired in the same resource (warning: this may take a long time) verbose (bool, optional): if True, the names of the lexicon-similarity dataset pairing are printed as they are processed Returns: a dictionary mapping each resource to a Pandas DataFrame indicating whether each pair of words in the semantic resource are present or grouped within the lexicon Examples: >>> evalData = check_if_grouped_many(lexicons, evalData) ... >>> evalData = check_if_grouped_many(lexicons, evalData, wordData) ''' newData = {} # loop through the evaluation data for simName, sims_new in evalData.items(): # loop through the lexicons for lexName, lex in lexicons.items(): # get the words in the lexicon if wordData is not None: words = wordData[lexName] else: words=None sims_new = check_if_grouped(lex=lex, sims=sims_new, lexName=lexName, words=words, getPaired=getPaired) if verbose: msg = '%s-%s is finished' % (simName, lexName) print(msg) # now that the similarity dataset is done, add to the dictionary newData[simName] = sims_new return newData def retrofit_legacy(wordVecs, lexicon, numIters=10): ''' Retrofits word vectors given a semantic lexicon. (the original code from Faruqui et al. 2015) ''' newWordVecs = deepcopy(wordVecs) wvVocab = set(newWordVecs.keys()) loopVocab = wvVocab.intersection(set(lexicon.keys())) for it in range(numIters): # loop through every node also in ontology (else just use data estimate) for word in loopVocab: wordNeighbours = set(lexicon[word]).intersection(wvVocab) numNeighbours = len(wordNeighbours) #no neighbours, pass - use data estimate if numNeighbours == 0: continue # the weight of the data estimate if the number of neighbours newVec = numNeighbours * wordVecs[word] # loop over neighbours and add to new vector (currently with weight 1) for ppWord in wordNeighbours: newVec += newWordVecs[ppWord] newWordVecs[word] = newVec/(2*numNeighbours) return newWordVecs def retrofit(wordVecs, lexicon, num_iters=10, alpha=1.0, beta=1.0, word2vec=False, keep_doubles=False): ''' Retrofits word vectors given a semantic lexicon. Args: wordVecs (list of `Numpy array`): the word vectors to be retrofitted lexicon (dict from str to `Numpy array`): a dictionary mapping each word to its neighbors in a semantic resource num_iters (int, optional): number of iterations alpha (float, optional): controls the strength of the fidelity to the original word vector beta (float, optional): controls the strength of the closeness of a word vector to its neighbors. By default it will be the inverse of the number of neighbors. word2vec (bool, optional): indicates that the vectors are `KeyedVectors` rather than ordinary Python dictionaries keep_doubles (bool, optional): if True, then repeated words are kept. if False, only one instance of each word is kept. 
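    Note (restating what the loop below computes): each iteration replaces a
    word's vector by
        q[w] <- (alpha * n * q0[w] + sum of q[u] over neighbours u) / (2 * n / beta)
    where q0[w] is the original (pre-retrofitting) vector, q[u] are the current
    vectors of the in-vocabulary neighbours, and n is the number of such
    neighbours (with repeats if keep_doubles=True). With alpha = beta = 1 this
    reduces to the update used in `retrofit_legacy`.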
Returns: the retrofitted word vectors Examples: >>> glove_retro = retrofit(glove, frameNet, numIters=10) ''' # get the words in the word vectors if word2vec: wvVocab = set(wordVecs.vocab.keys()) newWordVecs = {k : wordVecs[k] for k in wvVocab} else: wvVocab = set(wordVecs.keys()) newWordVecs = deepcopy(wordVecs) # create a dictionary which maps the words to their labels # (in the case of most lexicons, this will map onto the same word) word_to_label = {get_word_from_label(label) : label for label in lexicon.keys()} # get the words from the word vector vocabulary lexVocab = set(word_to_label.keys()) # find the intersection between vocab and lexicon loopVocab = wvVocab.intersection(set(lexVocab)) for it in range(num_iters): # loop through every node also in ontology (else just use data estimate) for word in loopVocab: # use the unique label to get to the neighbors label = word_to_label[word] good = set(lexicon[label]).intersection(wvVocab) # get the words which are in the if keep_doubles: bors = [w for w in lexicon[label] if w in good] else: bors = good # wordNeighbours = set(lexicon[label]).intersection(wvVocab) # numNeighbours = len(wordNeighbours) numBors = len(bors) # use the inverse of beta for algebraic simplicity # betaInv = numNeighbours/beta betaInv = numBors/beta #no neighbours, pass - use data estimate if numBors == 0: continue # the weight of the data estimate if the number of neighbors newVec = alpha * numBors * wordVecs[word] # loop over neighbors and add to new vector for ppWord in bors: newVec += newWordVecs[ppWord] newWordVecs[word] = newVec/(2.0*betaInv) return newWordVecs def retrofit_online(wordVecs, label_to_items, version, numIters=10, alpha=1.0, beta=1.0, word2vec=False): ''' Retrofits word vectors given a dictionary from labels to items comprising a semantic lexicon. Args:wo wordVecs (:obj:`list' of :obj:`Numpy array`): the word vectors to be retrofitted. label_to_items (dict from str to list of str): a dictionary mapping each group label to its items version (float): version of FrameNet numIters (:obj:`int`, optional): number of iterations (default=10) alpha (:obj:`float`, optional): controls the strength of the fidelity to the original word vector (default = 1) beta (:obj:`float`, optional): controls the strength of the closeness of a word vector to its neighbors. By default it will be the inverse of the number of neighbors. 
(default = 1) word2vec (bool, optional): indicates that the vectors are `KeyedVectors` rather than ordinary Python dictionaries Returns: the retrofitted word vectors Examples: retrofit(glove, label_to_items, numIters=10) ''' if word2vec: wvVocab = set(wordVecs.vocab.keys()) newWordVecs = {k : wordVecs[k] for k in wvVocab} else: wvVocab = set(wordVecs.keys()) newWordVecs = deepcopy(wordVecs) # get all of the unique words in the lexicon # lexWords = set([y for x in label_to_items.values() for y in x]) # loopVocab = wvVocab.intersection(lexWords) for it in range(numIters): # loop through all of the groupings in the ontology for label, items in label_to_items.items(): # preprocess the words post = processAnnoText(items, version) # find all of the words that are in the ontology groupVocab = wvVocab.intersection(post) numNeighbours = len(groupVocab) - 1 for word in groupVocab: wordNeighbours = groupVocab.copy() wordNeighbours.remove(word) # use the inverse of beta for algebraic simplicity betaInv = numNeighbours/beta #no neighbours, pass - use data estimate if numNeighbours == 0: continue # the weight of the data estimate if the number of neighbours newVec = alpha * numNeighbours * wordVecs[word] # loop over neighbours and add to new vector for ppWord in wordNeighbours: newVec += newWordVecs[ppWord] newWordVecs[word] = newVec/(2.0*betaInv) return newWordVecs def retrofit_lexicons(lexicons, vectors, legacy=False, verbose=True, word2vec_key=None, **kwargs): ''' Exhaustively retrofits a dictionary of word vectors given a dictionary of lexicons. Args: lexicons (dict from str to dict): a dictionary mapping the name of each lexicon to that lexicon vectors (dict from str to dict): a dictionary mapping the name of each set of word vectors to those vectors legacy (bool, optional): if True, uses the original retrofitting code verbose (bool, optional): if True, prints the names of the lexicons as they are retrofitted word2vec_key (str, optional): the name of the word2vec vectors kwargs: Additional arguments that get passed to `retrofit` or `retrofit_legacy` Returns: a dictionary mapping the name of each set of word vectors to those word vectors Examples: >>> retroVecs = retrofit_lexicons(lexicons, vectors) >>> retroVecs_legacy = retrofit_lexicons(lexicons, vectors, legacy=True) ''' if word2vec_key == None: word2vec_key = '' retroVecs = {} for lexName, lex in lexicons.items(): retroVecs[lexName] = {} # cycle through the word vectors for vecName, vec in vectors.items(): if vecName == word2vec_key: word2vec = True else: word2vec = False if legacy == False: retroVecs[lexName][vecName] = retrofit(vec, lex, word2vec=word2vec, **kwargs) elif legacy == True: retroVecs[lexName][vecName] = retrofit_legacy(vec, lex, **kwargs) if verbose: print('%s has been retrofitted with %s' % (vecName, lexName)) return retroVecs def retrofit_online_many(filenames, lexNames, lexicons, retroVecs, baseVecs, vecNames, version, verbose=False, sep='\t', **kwargs): ''' Helper function to retrofit multiple lexicons 'online' rather than using standard retrofit lexicons which list each word to its neighbors. 
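    A sketch of the intended call pattern (the file name, lexicon name and
    vector name below are hypothetical):
        >>> retroVecs, lexicons = retrofit_online_many(
        ...     filenames=[PATH_LEX + 'fn_compact'], lexNames=['FN_online'],
        ...     lexicons=lexicons, retroVecs=retroVecs, baseVecs=vectors,
        ...     vecNames='GloVe', version=1.7, verbose=True)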
    Args:
        filenames (list): list of file paths to the compact retrofitting
            lexicons
        lexNames (list): list of names for the retrofitted lexicons
        lexicons (dict from str to dict): a dictionary mapping the name of
            each lexicon to that lexicon
        retroVecs (dict from str to dict): a dictionary mapping the name of
            each set of word vectors to its retrofitted vectors
        baseVecs (dict from str to dict): a dictionary mapping the name of
            each set of word vectors to its baseline vectors
        vecNames (str or list): the name of the vector to retrofit or a list
            of names of vectors to retrofit
        version (float): the version of FrameNet used
        verbose (bool, optional): if True, the names of the lexicons are
            printed when they are completed
        sep (str, optional): separator passed on to `read_compact_lexicon`
        kwargs: additional keyword arguments passed to `retrofit_online`
    Returns:
        the updated `retroVecs` and `lexicons` variables
    '''
    if type(vecNames) == str:
        vecNames = [vecNames]
    elif type(vecNames) != list:
        raise ValueError('vecNames must be of type str or list')
    # loop through the vectors
    for vecName in vecNames:
        print('\nStarting vector %s' % vecName)
        # use a new set of vectors
        vectors = baseVecs[vecName]
        # loop through the filenames and lexNames
        n = len(filenames)
        for i in range(n):
            fn = filenames[i]
            lexName = lexNames[i]
            # load the compact lexicon
            label_to_items = read_compact_lexicon(fn + '.txt', sep=sep)
            results = retrofit_online(vectors, label_to_items, version,
                                      **kwargs)
            # add to the retrofitting dictionary
            if lexName not in retroVecs:
                # allows saving the previous results
                retroVecs[lexName] = {}
            retroVecs[lexName][vecName] = results
            # add the dictionary to the list of lexicons
            lexicons[lexName] = label_to_items
            if verbose:
                print('%s is done' % lexName)
        if verbose:
            print('Vector %s is done' % vecName)
    return retroVecs, lexicons


## EVALUATION

def cosine_sim(w1, w2, vectors, doLower=False, word2vec=False):
    ''' Finds the cosine similarity of the vector representations of two words.
    Args:
        w1 (str): the first word
        w2 (str): the second word
        vectors (dict from str to `np.array`): the word embeddings
        doLower (bool, optional): if True, lowers the words
        word2vec (bool, optional): indicates that the vectors are
            `KeyedVectors` rather than ordinary Python dictionaries
    Returns:
        the cosine similarity of the two word vectors
    '''
    if doLower:
        w1 = w1.lower()
        w2 = w2.lower()
    if word2vec:
        if (w1 not in vectors):
            return np.nan
        if (w2 not in vectors):
            return np.nan
    else:
        if (w1 not in vectors.keys()):
            return np.nan
        if (w2 not in vectors.keys()):
            return np.nan
    return 1 - cosine(vectors[w1], vectors[w2])


def cosine_sim_data(wordSim, vectors, columns=None, label='CosineSim',
                    doLower=False, fillZeros=False, getFrame=False,
                    wordMap=None, charMap=None, word2vec=False):
    ''' Finds the cosine similarity values across a word similarity dataset.
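    An illustrative call (the vector name and the column layout of `sims` are
    assumptions; `sims` is expected to have 'Word1' and 'Word2' columns):
        >>> df = cosine_sim_data(sims, vectors['GloVe'], label='Base',
        ...                      getFrame=True)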
Args: wordSim (`DataFrame`): the word similarity dataset vectors (dict from str to `np.array`): the word embeddings columns (list of str, optional): the column names for the first word and second word label (str, optional): label of the cosine similarity column doLower (bool, optional): if True, lowers the words fillZeros (bool, optional): if True, then any pair which contains an out-of-vocabulary word is set to zero getFrame (bool, optional): if True, returns a new DataFrame with the cosine similarity values, if False, returns the similarity data as an array wordMap (dict, optional): a dictionary mapping word forms in `wordSim` to their corresponding form in `vectors` charMap (dict, optional): if supplied, a mapping to apply to the words before attempting to extract them from the word vectors word2vec (bool, optional): indicates that the vectors are `KeyedVectors` rather than ordinary Python dictionaries Returns: the cosine similarity values across the words in the word similarity dataset, and possibly a new Pandas DataFrame with this information ''' if columns is None: columns = ['Word1', 'Word2'] # get the names of the columns with the words name_one, name_two = columns temp = wordSim[columns] if wordMap is not None: temp = temp.replace(wordMap) if charMap is not None: table = str.maketrans(charMap) for c in columns: temp[c] = temp[c].str.translate(table) sims = temp.apply(lambda x: cosine_sim(x[name_one], x[name_two], vectors, doLower=doLower, word2vec=word2vec), axis=1) if fillZeros: sims = sims.fillna(0) if getFrame: df = wordSim.copy() df[label] = sims return df else: return sims def get_metric(df, metric='spearman', col_gold='Score', col_base='Base', col_retro=None): ''' Gets the performance of a set of word vectors before and after retrofitting base on a given metric. Args: df (`DataFrame`): a DataFrame with the cosine similarity values of word vectors before and after retrofitting. metric (str or callable): if a string, the name of the performance metric. 
            Valid values:
                'spearman': Spearman's rank correlation coefficient [default]
                'pearson': Pearson product-moment correlation
                'hmean': harmonic mean of the Spearman and Pearson correlations
                'mse': mean squared error
                'msd': mean squared deviation, an alias of MSE
                'rmse': root-mean-square error
                'rmsd': root-mean-square deviation, an alias of RMSE
                'me': mean of the error
                'mae': mean absolute error
            if a function, the performance metric to be used
        col_gold (str, optional): name of the column with the gold
            standard values
        col_base (str, optional): name of the column with the similarities
            before retrofitting
        col_retro (str, optional): name of the column with the similarities
            after retrofitting
    Returns:
        two floats, the performance of the cosine similarity values of the
        word vectors against the word similarity judgments before and after
        retrofitting
    Examples:
        >>> rho = get_metric(df)
        >>> (rho_base, rho_retro) = get_metric(df, col_retro='Retro')
        >>> (r_base, r_retro) = get_metric(df, metric='pearson', col_retro='Retro')
        >>> (rmse_base, rmse_retro) = get_metric(df, metric='rmse')
    '''
    if type(metric) == str:
        if metric == 'spearman':
            func = lambda x, y: spearmanr(x, y)[0]
        elif metric == 'pearson':
            func = lambda x, y: pearsonr(x, y)[0]
        elif metric == 'hmean':
            func = lambda x, y: hmean([spearmanr(x, y)[0], pearsonr(x, y)[0]])
        elif metric in ['mse', 'msd']:
            func = mse
        elif metric in ['rmse', 'rmsd']:
            func = lambda x, y: math.sqrt(mse(x, y))
        elif metric == 'me':
            func = lambda x, y: np.average(y - x)
        elif metric == 'mae':
            func = lambda x, y: np.average(np.absolute(y - x))
        else:
            raise ValueError('Metric "%s" not supported; pass a callable '
                             'instead' % metric)
    elif callable(metric):
        func = metric
    # x is the prediction, y is the gold standard
    metric_base = func(df[col_base], df[col_gold])
    if col_retro is not None:
        metric_retro = func(df[col_retro], df[col_gold])
        return metric_base, metric_retro
    else:
        return metric_base


def load_frames(filenames, keys=None):
    ''' Loads csv files as a dictionary of Pandas DataFrames.
    Args:
        filenames (list of str): list of file names to load
        keys (list, optional): list of dictionary keys
    Returns:
        a dictionary mapping the name of a csv file to its corresponding
        Pandas DataFrame
    '''
    if keys == None:
        keys = filenames
    frames = {}
    for i in range(len(filenames)):
        f = filenames[i]
        d = pd.read_csv(f+'.csv')
        frames[keys[i]] = d
    return frames


def get_base_df(wordSim, vecs, col_base='Base', col_resid_base='Resid_base',
                col_score='Score', **kwargs):
    ''' Returns a DataFrame with the baseline performance of a set of word
    vectors against a word similarity dataset.
    Args:
        wordSim (`DataFrame`): word similarity dataframe
        vecs (dict from str to `np.array`): a set of word vectors
        col_base (str, optional): name of the column to store the word
            vectors' baseline cosine similarity values
        col_resid_base (str, optional): name of the column to store the
            residuals from the baseline cosine similarity values
        col_score (str, optional): name of the column with the human
            similarity judgment scores
        kwargs: additional keyword arguments passed on to `cosine_sim_data`
    Returns:
        a Pandas DataFrame with the cosine similarity measures between each
        pair of words present in the word similarity dataset.
Examples: >>> df = get_base_df(sims, vecs) ''' # add color rows df = wordSim.copy() # add base and retro predictions df[col_base] = cosine_sim_data(wordSim, vecs, **kwargs) df[col_resid_base] = df[col_base] - df[col_score] # # get the grouping information # if checkGrouping: # df = check_if_grouped(lex, df, lexName, words, getPaired) return df def get_change_df(wordSim, lexName, baseVecs, retroVecs, col_score='Score', **kwargs): ''' Returns a DataFrame with the performance of word vectors before and after retrofitting with a semantic resource against a word similarity dataset. Args: wordSim (`DataFrame`): word similarity dataframe lexName (str): name of the semantic resource baseVecs (dict from str to `np.array`): the original word vectors retroVecs (dict from str to `np.array`): the retrofitted word vectors col_score (str, optional): name of the column with the human similarity judgment scores kwargs: additional key-worded arguments passed on to `cosine_sim_data` Returns: a Pandas DataFrame with the cosine similarity measures between each pair of words present in the word similarity dataset. Examples: >>> df = get_change_df(sims, 'FrameNet', vectors_base, vectors_retro, fn) >>> df = get_change_df(sims, 'FrameNet', vectors_base, vectors_retro, fn, words) ''' # ''' # lex (dict from str to list of str, optional): a dictionary mapping # each word to its neighbors in a semantic resource # words (list of str, optional): if supplied, indicates whether # both words were present in the semantic lexicon # checkGrouping (bool, optional): if True, adds grouping # information (warning: this may be slow for large datasets) # getPaired (bool, optional): if True, adds pairing information # (warning: this may be slow for large datasets) # ''' # if checkGrouping: # if lex is None: # raise ValueError('lex must be provided if checkGrouping is True') # add rows df = wordSim.copy() # add base and retro predictions col_base = '{}_Base'.format(lexName) col_retro = '{}_Retro'.format(lexName) df[col_base] = cosine_sim_data(wordSim, baseVecs, **kwargs) df[col_retro] = cosine_sim_data(wordSim, retroVecs, **kwargs) # add residuals col_resid_base = '{}_Resid_base'.format(lexName) col_resid_retro = '{}_Resid_retro'.format(lexName) col_err_change = '{}_Err_change'.format(lexName) df[col_resid_base] = df[col_base] - df[col_score] df[col_resid_retro] = df[col_retro] - df[col_score] df[col_err_change] = abs(df[col_resid_retro]) - abs(df[col_resid_base]) # add whether the change in residual from the baseline col_change = '{}_Change'.format(lexName) df[col_change] = ['None' if v == 0 else 'Better' if v < 0 \ else 'Worse' for v in df[col_err_change].values] # # get the grouping information # if checkGrouping: # df = check_if_grouped(lex, df, lexName, words, getPaired) return df def get_change_data(evalData, baseVecs, retroVecs, lexicons, wordMaps=None, charMaps=None, lowerMap=None, word2vec_key=None, **kwargs): ''' Returns a dictionary indicating the trial-level performance of many word vectors retrofitting against many word similarity datasets. 
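    The result is keyed as changeData[vecName][simName]; per lexicon, each
    DataFrame gains the columns LEX_Base, LEX_Retro, LEX_Resid_base,
    LEX_Resid_retro, LEX_Err_change and LEX_Change (see `get_change_df`).
    For example (the vector, dataset and lexicon names are hypothetical):
        >>> changeData['GloVe']['MEN3K'][['Word1', 'Word2', 'Score',
        ...                               'FrameNet_Base', 'FrameNet_Retro']]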
Args: evalData (dict from str to dict): a dictionary containing the word similarity datasets baseVecs (dict from str to dict): a dictionary containing the word vectors prior to retrofitting retroVecs (dict from str to dict): a dictionary containing the word vectors after retrofitting lexicons (dict from str to dict): a dictionary mapping each name to the associated semantic resource wordMaps (dict from str to dict,, optional): a dictionary mapping vector names to dictionaries which map word forms in the similarity datasets to their corresponding vectors charMaps (dict from str to dict, optional): a dictionary mapping vector names to a mapping to apply to the words before attempting to extract them from the word vectors lowerMap (dict from str to bool, optional): a dictionary mapping vector names to whether or not the words should be lowered before the evaluation word2vec_key (str, optional): the name of the word2vec vectors kwargs: additional arguments passed to `get_change_df` Returns: a dictionary containing the trial-level performance of the word vectors against the word similarity datasets Examples: >>> changeData = get_change_data(evalData, baseVecs, retroVecs, lexicons) >>> changeData = get_change_data(evalData, baseVecs, retroVecs, lexicons, wordData) >>> changeData = get_change_data(evalData, baseVecs, retroVecs, lexicons, wordData, checkGrouping=True) ''' # if 'checkGrouping' in kwargs: # raise ValueError('use of checkGrouping in `get_change_data` ' + # 'no longer supported. please load the paired evalData') # if lexicons is None: # # expect that the evalData contains the lexicon information # tmp = list(evalData.values())[0].columns # (rathery hacky) # lexNames = tmp.str.extract('(.*)_Both', expand=True).dropna()[0].tolist() # lexicons = {lexName : None for lexName in lexNames} changeData = {} # cycle through the word vectors for vecName, vectors_base in baseVecs.items(): if vecName == word2vec_key: word2vec = True else: word2vec = False if wordMaps is not None: if vecName in wordMaps: vecWordMap = wordMaps[vecName] else: vecWordMap = None else: vecWordMap = None if charMaps is not None: if vecName in charMaps: vecCharMap = charMaps[vecName] else: vecCharMap = None else: vecCharMap = None if lowerMap is not None: if vecName in lowerMap: lower = lowerMap[vecName] else: lower = False else: lower = False changeData[vecName] = {} # cycle through the human data for simName, sims in evalData.items(): sims_vec = sims.copy() # cycle through the lexicons for lexName, lex in lexicons.items(): # get the retrofitted vectors vectors_retro = retroVecs[lexName][vecName] sims_vec = get_change_df(wordSim=sims_vec, lexName=lexName, baseVecs=vectors_base, retroVecs=vectors_retro, wordMap=vecWordMap, charMap=vecCharMap, doLower=lower, word2vec=word2vec, **kwargs) print('Finished with {}-{}-{}'.format(simName, vecName, lexName)) # once finished looping through the lexicons, store the result changeData[vecName][simName] = sims_vec return changeData def get_change_subset(df, lexName): ''' Returns the subset of a change dataframe to use to plot results. Args: df (:obj:`Pandas DataFrame`): a DataFrame with columns prefixed by LEXICON_[Measure] lexName (str): the name of the semantic resource Returns: the section of the table dealing with information pertaining to the semantic resource, with the prefix LEXICON_ removed. Examples: >>> changeData = get_change_data(...) 
        >>> df_fn = get_change_subset(changeData['GloVe']['MEN3K'], 'FN_v17')
    '''
    # copy the dataframe
    df_lex = df.copy()
    # create the regex command
    pat = '{}_'.format(lexName)
    # find the relevant columns
    lex_cols = df_lex.columns[df_lex.columns.str.contains(pat)]
    # add to the other columns (this is messy)
    # TODO: make sure that this isn't hard-coded
    lex_cols = pd.Index(['Word1', 'Word2', 'Score']).append(lex_cols)
    # get the subset
    df_lex = df_lex[lex_cols]
    # replace the lexicon prefix with an empty string
    df_lex.columns = df_lex.columns.str.replace(pat, '')
    return df_lex


# `get_metric` moved to `vector_utils.py`

def get_metric_data(data, metric='spearman', base=False, subset='all',
                    family=None, vecName=None, simName=None, **kwargs):
    ''' Gets the metric values before and possibly after retrofitting.
    Args:
        data (dict or `DataFrame`): a dictionary containing the trial-level
            performance of the word vectors against the word similarity
            datasets, or a Pandas DataFrame containing the trial-level
            performance for a particular set of word vectors evaluated
            against a particular human similarity dataset
        metric (str or function): if a string, the name of the performance
            metric. Valid values include 'spearman', 'pearson', 'mse' and
            'rmse' (see `get_metric` for the full list).
            if a function, the performance metric to be used
        base (bool, optional): if True, then only the baseline metrics
            are returned
        subset (`str`, optional): if 'all', uses all of the points,
            if 'both', uses only the points which were affected by
            retrofitting, if 'paired', uses only the points which were
            neighbors in the semantic resource
            Values: 'all', 'both', 'paired', 'better', 'worse', 'none'
        family (`str`, optional): if supplied, the name of the lexicon
            family to be added as a column name. Useful with calls to
            `retrofit_iter`.
        vecName (str, optional): the name or a list of names of the
            word vectors.
        simName (str, optional): the name or a list of the human similarity
            dataset names.
        kwargs: Additional arguments that get passed to `get_metric`
    Returns:
        a Pandas DataFrame containing the metric values computed between the
        cosine similarity values of the word vectors and the word similarity
        judgments before and after retrofitting
    Examples:
        >>> metricData = get_metric_data(changeData, metric='spearman')
        ...
>>> metricData = get_metric_data(changeData, metric='rmse') ''' if type(data) == pd.DataFrame: if vecName is None: print('vecName must be specified if changeData is of type DataFrame') if (base == False) & (simName is None): print('simName must be specified if changeData is of type DataFrame') # create the lists for the loops vecNames = [vecName] simNames = [simName] # change changeData to fit the schema below tmp = data.copy() data = {} data[vecName] = {} if base == False: data[vecName][simName] = tmp elif type(data) == dict: if vecName is not None: if type(vecName) == str: vecNames = [vecName] elif type(vecName) == list: vecNames = vecName else: raise ValueError('vecName must be of type str or list') else: # get the names of the word vectors vecNames = list(data.keys()) if (base == False) & (simName is not None): if type(simName) == str: simNames = [simName] elif type(simName) == list: simNames = simName else: raise ValueError('simName must be of type str or list') else: # get the names of the similarity datasets simNames = list(data[vecNames[0]].keys()) # assumes symmetry in the sim data # create a dictionary with the information metricData = {} metricData['Human Judgment'] = [] metricData['Word vector'] = [] metricData['Base'] = [] if base == False: metricData['Lexicon'] = [] metricData['Retro'] = [] # get the columns of the change dataframe if base == False: columns = data[vecNames[0]][simNames[0]].columns # extract the lexical resource names lexNames = columns.str.extract('(.*)_Retro', expand=False).dropna().values # cycle through the word vectors for vecName in vecNames: # cycle through the human data for simName in simNames: if base == False: # cycle through the lexicons for lexName in lexNames: metricData['Human Judgment'].append(simName) metricData['Lexicon'].append(lexName) metricData['Word vector'].append(vecName) # get the subset df = get_change_subset(data[vecName][simName], lexName) if subset == 'all': df_sub = df elif subset in ['both', 'paired']: df_sub = df.loc[df[subset.capitalize()] == True] elif subset in ['better', 'worse', 'none']: df_sub = df.loc[df['Change'] == subset.capitalize()] else: raise ValueError('subset value "%s" not supported' % subset) (metric_base, metric_retro) = get_metric(df_sub, metric=metric, col_retro='Retro', **kwargs) metricData['Base'].append(metric_base) metricData['Retro'].append(metric_retro) else: metricData['Human Judgment'].append(simName) metricData['Word vector'].append(vecName) df = data[vecName][simName] metric_base = get_metric(df, metric=metric, **kwargs) metricData['Base'].append(metric_base) # turn this into a Pandas DataFrame metricData = pd.DataFrame(metricData) if base == False: # add a change column metricData['Change'] = metricData['Retro'] - metricData['Base'] # add the family name if family != None: metricData = metricData.assign(Family=family) return metricData def get_base_data(evalData, baseVecs, wordMaps=None, charMaps=None, lowerMap=None, word2vec_key=None, **kwargs): ''' Returns a dictionary indicating the trial-level performance of many word vectors against many word similarity datasets. 
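    Each entry baseData[vecName][simName] is a copy of the similarity
    DataFrame with added 'Base' and 'Resid_base' columns (see `get_base_df`).
    For example (the vector and dataset names are hypothetical):
        >>> baseData['GloVe']['MEN3K'][['Word1', 'Word2', 'Score', 'Base']]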
Args: evalData (dict from str to dict): a dictionary containing the word similarity datasets baseVecs (dict from str to dict): a dictionary containing the word vectors prior to retrofitting wordMaps (dict from str to dict,, optional): a dictionary mapping vector names to dictionaries which map word forms in the similarity datasets to their corresponding vectors charMaps (dict from str to dict, optional): a dictionary mapping vector names to a mapping to apply to the words before attempting to extract them from the word vectors lowerMap (dict from str to bool, optional): a dictionary mapping vector names to whether or not the words should be lowered before the evaluation word2vec_key (str, optional): the name of the word2vec vectors kwargs: additional arguments passed to `get_base_df` Returns: a dictionary containing the trial-level performance of the word vectors against the word similarity datasets Examples: >>> baseData = get_base_data(evalData, baseVecs) ''' if word2vec_key == None: word2vec_key = '' baseData = {} # cycle through the word vectors for vecName, vectors_base in baseVecs.items(): if vecName == word2vec_key: word2vec = True else: word2vec = False if wordMaps is not None: if vecName in wordMaps: vecWordMap = wordMaps[vecName] else: vecWordMap = None else: vecWordMap = None if charMaps is not None: if vecName in charMaps: vecCharMap = charMaps[vecName] else: vecCharMap = None else: vecCharMap = None if lowerMap is not None: if vecName in lowerMap: lower = lowerMap[vecName] else: lower = False else: lower = False baseData[vecName] = {} # cycle through the human data for simName, sims in evalData.items(): sims_vec = sims.copy() sims_vec = get_base_df(wordSim=sims_vec, vecs=vectors_base, doLower=lower, wordMap=vecWordMap, charMap=vecCharMap, word2vec=word2vec, **kwargs) print('Finished with {}-{}'.format(simName, vecName)) # once finished looping through the lexicons, store the result baseData[vecName][simName] = sims_vec return baseData ## LOAD RESULTS def load_change_data(path, post=None): ''' Loads a directory as a dictionary containing the trial-level performance of the word vectors against the word similarity datasets. This functions as the reverse of `save_change_data`, but it simply loads all of the files with the compliant format into a single dictionary. ''' files = os.listdir(path) files = [f for f in files if f.endswith('.csv')] changeData = {} for f in files: items = f.split('_') if len(items) == 3: vecName, simName, postName = items if post is None: # discard the "postname" d = pd.read_csv(path + f) if vecName not in changeData: changeData[vecName] = {} changeData[vecName][simName] = d else: if post == postName[:-4]: d = pd.read_csv(path + f) if vecName not in changeData: changeData[vecName] = {} changeData[vecName][simName] = d else: continue # if post is None: # if len(items) == 3: # # discard the "postname" # vecName, simName, _ = items # else: # continue # d = pd.read_csv(path + f) # else: # raise NotImplementedError('loading by postfix not implemented yet') # # load the data # if vecName not in changeData: # changeData[vecName] = {} # changeData[vecName][simName] = d return changeData ## SAVING def save_change_data(path, changeData, post=None): ''' Saves a dictionary containing the trial-level performance of the word vectors against the word similarity datasets, as computed via `get_change_data`. `post` is a post-fix to distinguish results. 
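    For example (the output path and post-fix are hypothetical):
        >>> save_change_data(PATH_OUT_FIG, changeData, post='iter10')
    writes one csv per vector/dataset pair, e.g. 'GloVe_MEN3K_iter10.csv',
    which `load_change_data(PATH_OUT_FIG, post='iter10')` reads back into a
    nested dictionary of the same shape.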
''' for vecName, vecDict in changeData.items(): for simName, d in vecDict.items(): if post is not None: filename = '%s_%s_%s.csv' % (vecName, simName, post) else: filename = '%s_%s.csv' % (vecName, simName) d.to_csv(path + filename, index=False) ## PLOTTING def get_plot_metrics(df, metrics, df_plot=None, showChange=False): ''' Returns the metrics before and after retrofitting. Args: df (`DataFrame`): an evaluation dataframe containing the human scores, and the base and retrofitted values from the word vectors metrics (str, callable or list, optional): a string, a function, or a list of strings or functions to be passed on to `get_metric` to be used as performance metrics df_plot (`DataFrame`, optional): the dataframe used in `plot_results`. if supplied, the metrics are added to `df_plot` under the "Type" column, otherwise the metrics are returned as strings showChange (bool, optional): if True, the change in metric is computed rather than the raw values before and after Returns: if `df_plot` is supplied, the dataframe with the metrics added, otherwise two strings indicating the metrics (intended to be used as labels in a plot) ''' # copy the original dataframe if df_plot is not None: df_plot = df_plot.copy() # create a metric symbol dictionary (temp) metric_to_symbol = {'spearman':'rho', 'pearson':'r', 'rmse':'RMSE'} def get_one_metric(df, metric): '''another utility function, to avoid repeating in the loop''' val = get_metric(df, metric=metric, col_gold='Score', col_base='Base', col_retro='Retro') val_base = np.round(val[0], 3) val_retro = np.round(val[1], 3) return val_base, val_retro # if a string or function, simply run as normal if type(metrics) == str or callable(metrics): val_base, val_retro = get_one_metric(df, metrics) if metrics in metric_to_symbol: symbol = metric_to_symbol[metrics] else: symbol = 'metric' pat_retro = 'Retro ({}=%.3f)'.format(symbol) if showChange: # get the change val_change = val_retro - val_base text_base = 'Base' text_retro = pat_retro % val_change else: pat_base = 'Base ({}=%.3f)'.format(symbol) text_base = pat_base % val_base text_retro = pat_retro % val_retro elif type(metrics) == list: if showChange: text_base = 'Base' else: text_base = 'Base ({})' text_retro = 'Retro ({})' rep_base = '' rep_retro = '' for m in metrics: val_base, val_retro = get_one_metric(df, m) if m in metric_to_symbol: symbol = metric_to_symbol[m] else: symbol = 'metric' # add to the text if showChange: val_change = val_retro - val_base rep_retro += ('{}=%.3f, ' % val_change).format(symbol) else: rep_base += ('{}=%.3f, ' % val_base).format(symbol) rep_retro += ('{}=%.3f, ' % val_retro).format(symbol) # strip the extra comma and space if not showChange: rep_base = rep_base.rstrip(', ') rep_retro = rep_retro.rstrip(', ') # put into text_base and text_retro text_base = text_base.format(rep_base) text_retro = text_retro.format(rep_retro) if df_plot is not None: # update the df_plot values df_plot['Type'] = df_plot['Type'].map({'Base' : text_base, 'Retro' : text_retro}) return df_plot else: return text_base, text_retro def plot_results(data, lexName, vecName=None, simName=None, title=None, xlabel=None, ylabel=None, alpha=.7, s=75, tsize=14, xsize=14, ysize=14, legsize=12, legend_out=True, plotDiagonal=True, subset='all', showMetrics=True, showChange=False, metrics='spearman', filename=None, colors=None, **kwargs): ''' Plots human similarity judgment against the word vector cosine similarity values before and after retrofitting. 
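    A typical call with the dictionary form of `data` (the lexicon, vector
    and dataset names below are hypothetical):
        >>> g = plot_results(changeData, 'FrameNet', vecName='GloVe',
        ...                  simName='MEN3K')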
Args: data (dict or `DataFrame`): a dictionary containing the retrofitting results, or aPandas DataFrame containing the results of a single resource in retrofitting lexName (str): name of the semantic resource vecName (str, optional): name of the word vectors simName (str, optional): name of the similarity dataset xlabel (str, optional): x-label of the figure ylabel (str, optional): y-label of the figure title (str, optional): the title of the figure alpha (float, optional): controls the transparency of the marker s (int, optional): controls the size of the marker tsize (int): font size for the titles xsize (int): font size for the x labels ysize (int): font size for the y labels legsize (int): font size for the legend labels plotDiagonal (bool, optional): if True, plots the diagonal legend_out (bool, optional): if True, the legend is put outside the figure subset (str, optional): if 'all', plots all of the points, if 'both', plots only the points which were affected by retrofitting, if 'paired', plots only the points which were neighbors in the semantic resource Values: 'all', 'both', 'paired', 'better', 'worse', 'none' showMetrics (bool, optional): if True, shows the metric(s) on each of the figures showChange (bool, optional): if True, shows the change in the metric(s) on the retrofitted figure metrics (str, callable or list, optional): a string, a function, or a list of strings or functions to be passed on to `get_metric` to be used as performance metrics filename (str, optional): name of file path to save the resulting figure colors (list, optional): name of xkcd color names for the results. Default: ['blue', 'emerald green', 'red'] kwargs: additional arguments passed to `sns.FacetGrid` Returns: the FacetGrid object Examples: >>> df = changeData['GloVe']['MEN3K'] >>> plot_results(df, 'FrameNet', title='Lexicon : Data : Vector') ''' if type(data) == dict: if (vecName is None) | (simName is None): msg = 'if data is a dictionary then both vecName and simName must be specified' raise ValueError(msg) df = get_change_subset(data[vecName][simName], lexName) if xlabel is None: xlabel = vecName + ' cosine similarity' if ylabel is None: ylabel = 'Judgments from ' + simName elif type(data) == pd.DataFrame: # get the subset of the dataframe for the lexicon df = get_change_subset(data, lexName) if xlabel is None: xlabel = 'Word Vector' elif ylabel is None: ylabel = 'Human Judgment' else: raise ValueError('data must be either a dictionary of a DataFrame') if subset in ['all', 'better', 'worse']: id_vars = ['Word1', 'Word2', 'Score', 'Err_change', 'Change'] elif subset == 'both': id_vars = ['Word1', 'Word2', 'Score', 'Err_change', 'Change', \ 'Both'] elif subset == 'paired': id_vars = ['Word1', 'Word2', 'Score', 'Err_change', 'Change', \ 'Both', 'Paired'] # convert the dataframe to the form necessary for plotting df_plot = df.melt(id_vars=id_vars, \ value_vars=['Base', 'Retro'], var_name = 'Type', value_name= 'Vector') if colors is None: colors = ['blue', 'emerald green', 'red'] color_vals = sns.xkcd_palette(colors) # create the custom palette pal = {'None':color_vals[0], 'Better':color_vals[1], 'Worse':color_vals[2]} if title is None: title = lexName if subset == 'all': if showMetrics: df_plot = get_plot_metrics(df, metrics, df_plot, showChange=showChange) # hack hue_order = df['Change'].value_counts().index.values if 'None' in hue_order: hue_order = ['None', 'Better', 'Worse'] else: hue_order = ['Better', 'Worse'] g = sns.FacetGrid(df_plot, col='Type', hue='Change', legend_out=legend_out, \ 
        g = sns.FacetGrid(df_plot, col='Type', hue='Change',
                          legend_out=legend_out, palette=pal,
                          hue_order=hue_order, **kwargs)
    elif subset in ['both', 'paired']:
        df_sub = df.loc[df[subset.capitalize()] == True]
        df_plot = df_plot.loc[df_plot[subset.capitalize()] == True]
        if showMetrics:
            df_plot = get_plot_metrics(df_sub, metrics, df_plot,
                                       showChange=showChange)
        g = sns.FacetGrid(df_plot, col='Type', hue="Change",
                          legend_out=legend_out, palette=pal,
                          hue_order=['Better', 'Worse'], **kwargs)
    elif subset in ['better', 'worse', 'none']:
        df_sub = df.loc[df['Change'] == subset.capitalize()]
        df_plot = df_plot.loc[df_plot['Change'] == subset.capitalize()]
        if showMetrics:
            df_plot = get_plot_metrics(df_sub, metrics, df_plot,
                                       showChange=showChange)
        g = sns.FacetGrid(df_plot, col='Type', hue="Change",
                          legend_out=legend_out, palette=pal, **kwargs)
    else:
        raise ValueError('subset value not recognized')

    g = g.map(plt.scatter, "Vector", "Score", alpha=alpha, s=s)
    g.set_titles("{col_name}", size=tsize, weight='bold')

    leg = g.add_legend(prop={'weight': 'bold', 'size': legsize})
    new_title = 'Change'
    # set the title
    if legend_out:
        leg = g._legend
    else:
        leg = g.facet_axis(0, 0).get_legend()
    leg.set_title(new_title)
    plt.setp(leg.get_title(), **{'weight': 'bold', 'size': str(tsize)})

    g.fig.suptitle(title, size=tsize, weight='bold')
    g.fig.subplots_adjust(top=.9)  # offset
    g.set_xlabels(xlabel, size=xsize, weight='bold')
    g.set_ylabels(ylabel, size=ysize, weight='bold')

    if plotDiagonal:
        # plot the diagonal
        for ax in g.axes.ravel():
            ax.plot([0, 1], [0, 1], color='black', ls='--')

    if filename is not None:
        g.savefig(filename)

    return g


def plot_metrics(data, title='', ylabel='', subset='all', metric='spearman',
                 invert=False, base=False, text_size=14, leg_labels_dict=None,
                 leg_title_weight='bold', leg_title_size=14,
                 leg_texts_size=13, vecName=None, simName=None, height=6,
                 aspect=0.7, filename=None, **kwargs):
    '''
    Plots a DataFrame using some metric to compare human similarity judgment
    against the word vector cosine similarity values before and after
    retrofitting.

    Args:
        data (dict from str to dict): a dictionary containing the trial-level
            performance of the word vectors against the word similarity
            datasets
        title (str, optional): the title of the figure
        ylabel (str, optional): the label of the y-axis
        subset (str, optional): if 'all', plots all of the points, if 'both',
            plots only the points which were affected by retrofitting, if
            'paired', plots only the points which were neighbors in the
            semantic resource.
            Values: 'all', 'both', 'paired', 'better', 'worse', 'none'
        metric (str or callable): a string or function to be passed on to
            `get_metric` to be used as the performance metric
        invert (bool, optional): if True, the y-axis is inverted
        base (bool, optional): if True, the baseline performance of the
            vectors is plotted
        text_size (int): font size of the axes and title
        leg_labels_dict (dict, optional): if specified, a dictionary mapping
            the old labels to the new labels
        leg_title_weight (str, optional): weight of the legend title
        leg_title_size (str or int, optional): font size of the legend title
        leg_texts_size (str or int, optional): font size of the legend texts
        vecName (str, optional): the name or a list of names of the word
            vectors
        simName (str, optional): the name or a list of the human similarity
            dataset names
        height (int, optional): the height of the plot
        aspect (float, optional): the aspect of the plot
        filename (str, optional): file path to save the resulting figure
        kwargs: additional key-worded arguments passed to `seaborn.catplot`

    Returns:
        the FacetGrid of the plot

    Examples:
        >>> changeData = get_change_data(...)
        >>> plot_metrics(changeData, metric='spearman',
        ...              ylabel='Change in Spearman Correlation')
        >>> plot_metrics(changeData, metric='pearson',
        ...              ylabel='Change in Pearson Correlation')
        >>> plot_metrics(changeData, metric='rmse', ylabel='Change in RMSE')

        >>> baseData = get_base_data(...)
        >>> plot_metrics(baseData)
    '''
    # get the proper subset
    df_sub = get_metric_data(data, metric=metric, base=base, subset=subset,
                             vecName=vecName, simName=simName)
    # restrict to the labels present, if any
    if leg_labels_dict is not None:
        df_sub = df_sub[df_sub['Lexicon'].isin(leg_labels_dict.keys())]

    # with sns.plotting_context("notebook", font_scale=1.5):
    if base == False:
        g = sns.catplot(x='Word vector', y='Change', hue='Lexicon',
                        col='Human Judgment', data=df_sub, kind="bar",
                        height=height, aspect=aspect, legend_out=True,
                        **kwargs)
    else:
        g = sns.catplot(x='Word vector', y='Base', col='Human Judgment',
                        data=df_sub, kind="bar", height=height, aspect=aspect,
                        legend_out=True, **kwargs)

    # override font weight
    g.set_titles("{col_name}", size=text_size, weight='bold')
    g.set_xlabels('Word vector', size=text_size, fontweight='bold')
    g.set_ylabels(ylabel, size=text_size, fontweight='bold')
    g.fig.suptitle(title, size=text_size, weight='bold')
    g.fig.subplots_adjust(top=.9)  # offset

    if invert:
        plt.gca().invert_yaxis()

    # deal with the legend
    leg = g._legend
    if leg is not None:
        # annoying way to change this
        plt.setp(leg.get_title(), **{'weight': leg_title_weight,
                                     'size': leg_title_size})
        plt.setp(leg.get_texts(), fontsize=leg_texts_size)
        # replace labels
        if leg_labels_dict is not None:
            for t in leg.texts:
                t.set_text(leg_labels_dict[t.get_text()])

    if filename is not None:
        g.savefig(filename)

    return g


def annotate_results(data, lexName, g, w1, w2, vecName=None, simName=None,
                     show_points=False, show_labels=False, markers=None,
                     ms=15):
    '''
    Annotates the figure resulting from `plot_results`, which shows the
    effects of retrofitting.
    Args:
        data (dict or `DataFrame`): a dictionary containing the retrofitting
            results, or a Pandas DataFrame containing the results of a single
            resource in retrofitting
        lexName (str): name of the semantic resource
        g (`seaborn.axisgrid.FacetGrid`): FacetGrid resulting from
            `plot_results`
        w1 (str or list): the first word in the word pair, or a list of first
            words
        w2 (str or list): the second word in the word pair, or a list of
            second words
        vecName (str, optional): name of the word vectors
        simName (str, optional): name of the similarity dataset
        show_points (bool, optional): if True, plots the annotated points in
            black
        show_labels (bool, optional): if True, shows the word-pair labels
        markers (list, optional): list of markers for the points
        ms (int): marker size

    Returns:
        a Seaborn FacetGrid
    '''
    if type(data) == dict:
        if (vecName is None) or (simName is None):
            msg = ('if data is a dictionary then both vecName and simName '
                   'must be specified')
            raise ValueError(msg)
        df = get_change_subset(data[vecName][simName], lexName)
    elif type(data) == pd.DataFrame:
        # get the subset of the dataframe for the lexicon
        df = get_change_subset(data, lexName)
    else:
        raise ValueError('data must be either a dictionary or a DataFrame')

    if type(w1) == str:
        if type(w2) != str:
            raise ValueError('w1 and w2 must be of the same type')
        w1 = [w1]
        w2 = [w2]
    elif type(w1) == list:
        if type(w2) != list:
            raise ValueError('w1 and w2 must be of the same type')
        if len(w1) != len(w2):
            raise ValueError('w1 and w2 must have the same length')
    else:
        raise ValueError('w1 and w2 must be either strings or lists of strings')

    df = df.assign(**{'Label': ['%s &\n%s' % (x, y)
                                for (x, y) in zip(df['Word1'], df['Word2'])]})

    # get the critical pairs
    labs = ['%s &\n%s' % (x, y) for (x, y) in zip(w1, w2)]
    sub = df[df['Label'].isin(labs)]
    # get the indices (hack)
    inds = [df.index[df['Label'] == l][0] for l in labs]
    sub = sub.loc[inds]

    # get the markers
    if markers is None:
        markers = ['o'] * len(sub)

    # get the x, y values and plot them
    for base, retro, score, lab, m in zip(sub['Base'], sub['Retro'],
                                          sub['Score'], labs, markers):
        if show_labels:
            g.facet_axis(0, 0).annotate(lab, xy=(base - 0.1, score + 0.05),
                                        fontsize=20, fontweight='bold')
            g.facet_axis(0, 1).annotate(lab, xy=(retro - 0.1, score + 0.05),
                                        fontsize=20, fontweight='bold')
        if show_points:
            # debug for visuals
            cmd = 'k' + m
            g.facet_axis(0, 0).plot(base, score, cmd, ms=ms)
            g.facet_axis(0, 1).plot(retro, score, cmd, ms=ms)

    return g


if __name__ == '__main__':

    RERUN = True  # boolean indicating if the analysis should be recomputed

    sns.set_style('darkgrid')

    version = 1.71  # version of FrameNet to use (compatibility with other code)
    print('Using version %s' % version)

    # constants
    # retrofitting lexicons
    # code to generate the retrofitting lexicons is not present in this file
    lex_names_main = ['ppdb-xl', 'wordnet_plus', 'framenet_lus']
    lex_keys_main = ['PPDB', 'WN-PLUS', 'FN']
    labeled_main = [False, False, False]

    # lex_names_anno = ['framenet-live-fe-nouns']
    # lex_names_anno = ['fn_retro_nouns_first', 'fn_retro_nouns_last',
    #                   'fn_retro_nouns_one', 'fn_retro_nouns_all']
    lex_names_anno = ['fn_retro_nouns_last']
    lex_keys_anno = ['FN-ANNO']
    # lex_keys_anno = ['FN-ANNO-FIRST', 'FN-ANNO-LAST', 'FN-ANNO-ONES',
    #                  'FN-ANNO-ALL']

    lex_names_main = [PATH_LEX + l for l in lex_names_main]
    lex_names_anno = [PATH_LEX + l for l in lex_names_anno]
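    # The lexicon files are assumed to contain one entry per line: a head word
    # followed by the neighbors it should be retrofitted towards, e.g.
    # (illustrative only; the exact layout is whatever load_lexicons /
    # load_compact_lexicons expect):
    #
    #   dog canine hound puppy
    #   film movie cinema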
    # leg_labels_dict = {'PPDB': 'PPDB', 'WN-PLUS': 'WN+', 'FN': 'FN',
    #                    'FN-NOUN': 'FN-ANNO'}
    leg_labels_dict = {'PPDB': 'PPDB', 'WN-PLUS': 'WN+', 'FN': 'FN',
                       'FN-ANNO-FIRST': 'FN-ANNO-FIRST',
                       'FN-ANNO-LAST': 'FN-ANNO-LAST',
                       'FN-ANNO-ONES': 'FN-ANNO-ONES',
                       'FN-ANNO-ALL': 'FN-ANNO-ALL',
                       'FN-ANNO': 'FN-ANNO'}

    vec_keys = ['NB', 'GloVe', 'SG']
    # vec_keys = ['GloVe', 'SG']

    # XXX: "EVAL" files were cached vectors to speed up the loading process
    vec_names = [PATH_EVAL_DICT[v] for v in vec_keys]
    # XXX: to use the full word vector files (not included), the files must be
    # added to the folder "vectors", and the line "SG_KEY = 'SG'" near the top
    # of this file needs to be uncommented so that the code knows SG refers to
    # KeyedVectors
    # vec_names = [PATH_FULL_DICT[v] for v in vec_keys]

    sim_keys = ['MT771', 'MEN3K', 'RW', 'SL999']
    # note: these files are labeled "PAIRED" because they contain cached
    # information
    sim_names = ['MT771_PAIRED', 'MEN3K_PAIRED', 'RW_PAIRED', 'SL999_PAIRED']
    sim_names = [PATH_SIM + s for s in sim_names]

    # load the vectors, lexicons and similarity sets
    # baseVecs = load_word_vecs(vec_names, vec_keys, word2vec_key=WORD2VEC_KEY)
    baseVecs = load_word_vecs(vec_names, vec_keys, word2vec_key='')
    lexicons = load_lexicons(lex_names_main, labeled_main, lex_keys_main)
    # lexicons_anno = load_compact_lexicons(lex_names_anno, keys=lex_keys_anno,
    #                                       sep=' ')
    evalData = load_frames(sim_names, sim_keys)

    baseData = get_base_data(evalData, baseVecs, wordMaps=WORD_MAP_DICT,
                             charMaps=CHAR_MAP_DICT, lowerMap=VEC_LOWER_DICT,
                             fillZeros=True, word2vec_key='')

    # get the baseline metrics
    base_hmean = get_metric_data(baseData, metric='hmean', base=True,
                                 subset='all')
    base_rmse = get_metric_data(baseData, metric='rmse', base=True,
                                subset='all')

    # TO RERUN THE ANALYSIS FROM SCRATCH
    if RERUN:
        retroVecs = retrofit_lexicons(lexicons, baseVecs, word2vec_key='')
        retroVecs, lexicons = retrofit_online_many(lex_names_anno,
                                                   lex_keys_anno, lexicons,
                                                   retroVecs, baseVecs,
                                                   vec_keys, version,
                                                   verbose=True, sep=' ')
        # add the grouping information
        evalData = check_if_grouped_many(lexicons, evalData, getPaired=False)
        changeData = get_change_data(evalData, baseVecs, retroVecs, lexicons,
                                     wordMaps=WORD_MAP_DICT,
                                     charMaps=CHAR_MAP_DICT,
                                     lowerMap=VEC_LOWER_DICT, fillZeros=True)
    else:
        changeData = load_change_data('results/', post='0403B')

    # # FIG 1: Spearman, all pairs
    # plot_metrics(changeData,
    #              title='Change in Spearman after retrofitting (all pairs)',
    #              vecName=['GloVe', 'SG'],
    #              simName=['MT771', 'MEN3K', 'RW', 'SL999'],
    #              subset='all', metric='spearman', invert=False,
    #              leg_labels_dict=leg_labels_dict,
    #              text_size=18, leg_texts_size=18, leg_title_size=18,
    #              height=5, ylabel='Change in Spearman correlation')

    # # FIG 1B: Pearson, all pairs
    # plot_metrics(changeData,
    #              title='Change in Pearson after retrofitting (all pairs)',
    #              vecName=['GloVe', 'SG'],
    #              simName=['MT771', 'MEN3K', 'RW', 'SL999'],
    #              subset='all', metric='pearson', invert=False,
    #              leg_labels_dict=leg_labels_dict,
    #              text_size=18, leg_texts_size=18, leg_title_size=18,
    #              height=5, ylabel='Change in Pearson correlation')

    # FIG 1C: Harmonic Mean, all pairs
    plot_metrics(changeData,
                 title='Change in correlation after retrofitting (all pairs)',
                 vecName=['GloVe', 'SG'],
                 simName=['MT771', 'MEN3K', 'RW', 'SL999'],
                 subset='all', metric='hmean', invert=False,
                 leg_labels_dict=leg_labels_dict,
                 text_size=18, leg_texts_size=18, leg_title_size=18, height=5,
                 ylabel='Change in harmonic mean of correlations')

    # FIG 2: RMSE, all pairs
    plot_metrics(changeData,
                 title='Change in RMSE after retrofitting (all pairs)',
                 vecName=['GloVe', 'SG'],
                 simName=['MT771', 'MEN3K', 'RW', 'SL999'],
                 subset='all', metric='rmse', invert=True,
                 leg_labels_dict=leg_labels_dict,
                 text_size=18, leg_texts_size=18, leg_title_size=18, height=5,
                 ylabel='Change in RMSE')

    #
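    # Optional caching sketch: when RERUN is True nothing is written to disk,
    # so a later run with RERUN = False has nothing for load_change_data to
    # read. The loop below mirrors the CSV-writing helper defined above; the
    # 'results/' folder and the '0403B' tag are assumptions, and the block is
    # left commented out so it does not change the behavior of this script.
    # for vecName, vecDict in changeData.items():
    #     for simName, d in vecDict.items():
    #         fname = 'results/%s_%s_%s.csv' % (vecName, simName, '0403B')
    #         d.to_csv(fname, index=False)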
    # # FIG 3: Spearman, pairs in resource
    # plot_metrics(changeData,
    #              title='Change in Spearman after retrofitting (pairs in resource)',
    #              vecName=['GloVe', 'SG'],
    #              simName=['MT771', 'MEN3K', 'RW', 'SL999'],
    #              subset='both', metric='spearman', invert=False,
    #              leg_labels_dict=leg_labels_dict,
    #              text_size=18, leg_texts_size=18, leg_title_size=18,
    #              height=5, ylabel='Change in Spearman correlation')

    # # FIG 3B: Pearson, pairs in resource
    # plot_metrics(changeData,
    #              title='Change in Pearson after retrofitting (pairs in resource)',
    #              vecName=['GloVe', 'SG'],
    #              simName=['MT771', 'MEN3K', 'RW', 'SL999'],
    #              subset='both', metric='pearson', invert=False,
    #              leg_labels_dict=leg_labels_dict,
    #              text_size=18, leg_texts_size=18, leg_title_size=18,
    #              height=5, ylabel='Change in Pearson correlation')

    # FIG 3C: Harmonic Mean, pairs in resource
    plot_metrics(changeData,
                 title='Change in correlation after retrofitting (pairs in resource)',
                 vecName=['GloVe', 'SG'],
                 simName=['MT771', 'MEN3K', 'RW', 'SL999'],
                 subset='both', metric='hmean', invert=False,
                 leg_labels_dict=leg_labels_dict,
                 text_size=18, leg_texts_size=18, leg_title_size=18, height=5,
                 ylabel='Change in harmonic mean of correlations')

    # FIG 4: RMSE, pairs in resource
    plot_metrics(changeData,
                 title='Change in RMSE after retrofitting (pairs in resource)',
                 vecName=['GloVe', 'SG'],
                 simName=['MT771', 'MEN3K', 'RW', 'SL999'],
                 subset='both', metric='rmse', invert=True,
                 leg_labels_dict=leg_labels_dict,
                 text_size=18, leg_texts_size=18, leg_title_size=18, height=5,
                 ylabel='Change in RMSE')

    # FIG 5: FN-ANNO, all pairs
    g = plot_results(changeData, 'FN-ANNO', 'GloVe', 'MT771',
                     metrics=['spearman', 'rmse'], subset='all',
                     title='FN-ANNO (all pairs)', s=75, legend_out=False,
                     xsize=20, ysize=22, tsize=20, showChange=True)
    w1 = ['find', 'occasion', 'film']
    w2 = ['occurrence', 'second', 'movie']
    annotate_results(changeData, 'FN-ANNO', g, w1, w2, 'GloVe', 'MT771',
                     show_points=True, markers=['D', 'X', 's'], ms=18)

    # FIG 5B: FN-ANNO, pairs in resource
    g = plot_results(changeData, 'FN-ANNO', 'GloVe', 'MT771',
                     metrics=['spearman', 'rmse'], subset='both',
                     title='FN-ANNO (pairs in resource)', s=75,
                     legend_out=False, xsize=20, ysize=22, tsize=20,
                     showChange=True)
    w1 = ['find', 'occasion', 'film']
    w2 = ['occurrence', 'second', 'movie']
    annotate_results(changeData, 'FN-ANNO', g, w1, w2, 'GloVe', 'MT771',
                     show_points=True, markers=['D', 'X', 's'], ms=18)
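
    # To keep the generated figures, either pass filename=... to the plotting
    # calls above or save a grid explicitly, e.g. (the file name here is only
    # an example; PATH_OUT_FIG is defined at the top of this file):
    # g.savefig(PATH_OUT_FIG + 'fig5b_fn_anno_pairs.png')

    # display the figures when the script is run non-interactively
    plt.show()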