user_df = pd.DataFrame.from_dict(user_calibrations, orient='index')
user_df

article_df = pd.DataFrame.from_dict(article_calibrations, orient='index')
article_df

for name, bipartite_dict in bipartite_dicts.iteritems():
    M, text_dict, user_dict = viz_bipartite(bipartite_dict, name)
    np.save('geniusdata/'+name+'/M.npy', M)
    json.dump(text_dict, open('geniusdata/'+name+'/text_dict.json', 'w'))
    json.dump(user_dict, open('geniusdata/'+name+'/user_dict.json', 'w'))
    text_exogenous_ranks = make_exogenous_ranks(text_dict, all_text_exogenous)
    user_exogenous_ranks = make_exogenous_ranks(user_dict, all_user_exogenous)
    json.dump(text_exogenous_ranks, open('geniusdata/'+name+'/text_exogenous_ranks.json', 'w'))
    json.dump(user_exogenous_ranks, open('geniusdata/'+name+'/user_exogenous_ranks.json', 'w'))

article_calibrations = dict()
for name, data in subdomain_data.iteritems():
    calibrations = calibrate(data, 'articles', name)
    print name, calibrations
    article_calibrations[name] = calibrations

user_calibrations = dict()
for name, data in subdomain_data.iteritems():
    calibrations = calibrate(data, 'users', name)
    print name, calibrations
    user_calibrations[name] = calibrations

import json
import numpy as np
import networkx as nx
from collections import defaultdict
import operator
import pandas as pd
import scipy.stats as ss
# plotting names used bare below (plt, imshow, plot, semilogx, figsize)
# appear to come from the notebook's %pylab inline mode

!ls geniusdata/

genius_data = {prefix: json.load(open('geniusdata/%s_dictionary.json' % prefix, 'r'))
               for prefix in ['tag', 'text', 'user']}

# sanity check: every annotating user should appear in the user dictionary
users = set()
for text_id, text_dict in genius_data['text'].iteritems():
    for user in text_dict['annotating_users']:
        users.add(user)
print len(users) == len(genius_data['user'])

# need to reverse the tag dict
id_to_tag = dict()
for name, tag_id in genius_data['tag'].iteritems():
    id_to_tag[tag_id] = name

subdomains = defaultdict(dict)
for text_id, text_dict in genius_data['text'].iteritems():
    tag_id = text_dict['tag_id']
    tag_name = id_to_tag[tag_id]
    subdomains[tag_name][text_id] = text_dict

print {subdomain_name: len(subdomain_dict)
       for subdomain_name, subdomain_dict in subdomains.iteritems()}

def make_bipartite_dict(subdomain_dict):
    # prefix ids with 't'/'u' so text and user node names cannot collide
    return {'t'+text_id: ['u'+str(user) for user in text_dict['annotating_users']]
            for text_id, text_dict in subdomain_dict.iteritems()}

bipartite_dicts = {subdomain_name: make_bipartite_dict(subdomain_dict)
                   for subdomain_name, subdomain_dict in subdomains.iteritems()}

def viz_bipartite(bipartite, name):
    bipartite_G = nx.Graph()
    text_encountered = list()
    user_encountered = list()
    for text, user_list in bipartite.iteritems():
        text_encountered.append(text)
        for user in user_list:
            bipartite_G.add_edge(text, user)
            if user not in user_encountered:
                user_encountered.append(user)
    M = nx.algorithms.bipartite.basic.biadjacency_matrix(G=bipartite_G,
                                                         row_order=text_encountered,
                                                         column_order=user_encountered)
    # I wish numpy weren't so procedural, so I didn't have to do this;
    # pandas does it right by returning the object instead of sorting in place
    return_M = M.copy()
    text_dict = {text: text_encountered.index(text) for text in text_encountered}
    user_dict = {user: user_encountered.index(user) for user in user_encountered}
    plt.figure(figsize=(15, 15))
    fig = imshow(M, cmap=plt.cm.gray_r, interpolation='nearest')
    fig.axes.set_title('"Raw" Text-User Matrix for %s' % name)
    fig.axes.set_xlabel('users')
    fig.axes.set_ylabel('texts')
    plt.show()
    # in-place sorts, which is why return_M was copied above
    M.sort(axis=0)
    M.sort(axis=1)
    plt.figure(figsize=(15, 15))
    fig = imshow(M, cmap=plt.cm.gray_r, interpolation='nearest')
    fig.axes.set_xlim(fig.axes.get_xlim()[::-1])
    fig.axes.set_title('"Sorted" Text-User Matrix for %s' % name)
    fig.axes.set_xlabel('users')
    fig.axes.set_ylabel('texts')
    plt.show()
    return return_M, text_dict, user_dict
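# A minimal sanity check for viz_bipartite on a made-up toy bipartite dict
# (the 't*'/'u*' ids below are invented for illustration). Since 'u2'
# annotates both texts, its column in the biadjacency matrix should be all
# ones, and the returned dicts should map each id to its row/column index.
toy_bipartite = {'t1': ['u1', 'u2'], 't2': ['u2', 'u3']}
toy_M, toy_text_dict, toy_user_dict = viz_bipartite(toy_bipartite, 'toy')
print toy_M.shape                     # (2, 3): one row per text, one column per user
print toy_M[:, toy_user_dict['u2']]   # all ones for the shared annotator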
all_user_exogenous = {'u'+user_id: activity
                      for user_id, activity in genius_data['user'].iteritems()}
all_text_exogenous = {'t'+text_id: text_dict['annotations_count']
                      for text_id, text_dict in genius_data['text'].iteritems()}

def make_exogenous_ranks(specific_dict, all_dict):
    '''ranks the ids in specific_dict by their exogenous score from all_dict'''
    exogenous_scores = [(user_id, score) for user_id, score in all_dict.iteritems()
                        if user_id in specific_dict]
    exogenous_scores_order = sorted(exogenous_scores, key=lambda tup: tup[1])
    exogenous_ranks_order = [tup[0] for tup in exogenous_scores_order]
    exogenous_ranks = zip(exogenous_ranks_order, range(len(exogenous_ranks_order)))
    return exogenous_ranks

'''for subdomain_name, bipartite_dict in bipartite_dicts.iteritems():
    print subdomain_name
    plt.figure(figsize=(10,10))
    bipartite_network = nx.Graph(data=bipartite_dict)
    #TODO colouring the nodetypes
    nx.draw_spring(bipartite_network)
'''

def load_files(folder):
    M = np.load(folder+'M.npy')
    user_dict = json.load(open(folder+'user_dict.json', 'r'))
    article_dict = json.load(open(folder+'text_dict.json', 'r'))
    user_exogenous_ranks = json.load(open(folder+'user_exogenous_ranks.json', 'r'))
    article_exogenous_ranks = json.load(open(folder+'text_exogenous_ranks.json', 'r'))
    return {'M': M,
            'user_dict': user_dict,
            'article_dict': article_dict,
            'user_exogenous_ranks': user_exogenous_ranks,
            'article_exogenous_ranks': article_exogenous_ranks}

rap_data = load_files('geniusdata/rap/')

def Gcp_denominateur(M, p, k_c, beta):
    M_p = M[:, p]
    k_c_beta = k_c ** (-1 * beta)
    return np.dot(M_p, k_c_beta)

def Gpc_denominateur(M, c, k_p, alpha):
    M_c = M[c, :]
    k_p_alpha = k_p ** (-1 * alpha)
    return np.dot(M_c, k_p_alpha)

def make_G_hat(M, alpha=1, beta=1):
    '''G hat is a Markov chain of length 2:
    G_cp is the matrix to go from countries to products,
    and G_pc is the matrix to go from products to countries'''
    k_c = M.sum(axis=1)  # k_c, summing over the rows
    k_p = M.sum(axis=0)  # k_p, summing over the columns

    G_cp = np.zeros(shape=M.shape)  # Gcp_beta
    for [c, p], val in np.ndenumerate(M):
        numerateur = M[c, p] * (k_c[c] ** (-1 * beta))
        denominateur = Gcp_denominateur(M, p, k_c, beta)
        G_cp[c, p] = numerateur / float(denominateur)

    G_pc = np.zeros(shape=M.T.shape)  # Gpc_alpha
    for [p, c], val in np.ndenumerate(M.T):
        numerateur = M.T[p, c] * (k_p[p] ** (-1 * alpha))
        denominateur = Gpc_denominateur(M, c, k_p, alpha)
        G_pc[p, c] = numerateur / float(denominateur)

    return {'G_cp': G_cp, 'G_pc': G_pc}

def w_generator(M, alpha, beta):
    # this cannot return the zeroth iteration
    G_hat = make_G_hat(M, alpha, beta)
    G_cp = G_hat['G_cp']
    G_pc = G_hat['G_pc']
    fitness_0 = np.sum(M, 1)
    ubiquity_0 = np.sum(M, 0)
    fitness_next = fitness_0
    ubiquity_next = ubiquity_0
    i = 0
    while True:
        fitness_prev = fitness_next
        ubiquity_prev = ubiquity_next
        i += 1
        fitness_next = np.sum(G_cp * ubiquity_prev, axis=1)
        ubiquity_next = np.sum(G_pc * fitness_prev, axis=1)
        yield {'iteration': i, 'fitness': fitness_next, 'ubiquity': ubiquity_next}

def w_stream(M, i, alpha, beta):
    """gets the i'th iteration of reflections of M, but in a memory-safe way,
    so we can calculate many generations"""
    if i < 0:
        raise ValueError
    for j in w_generator(M, alpha, beta):
        # the generator yields dicts, so index by key rather than by position
        if j['iteration'] == i:
            return {'fitness': j['fitness'], 'ubiquity': j['ubiquity']}

def find_convergence(M, alpha, beta, fit_or_ubiq, do_plot=False):
    '''finds the convergence point (or gives up after 1000 iterations)'''
    if fit_or_ubiq == 'fitness':
        Mshape = M.shape[0]
    elif fit_or_ubiq == 'ubiquity':
        Mshape = M.shape[1]
    rankings = list()
    scores = list()
    prev_rankdata = np.zeros(Mshape)
    iteration = 0
    for stream_data in w_generator(M, alpha, beta):
        iteration = stream_data['iteration']
        data = stream_data[fit_or_ubiq]
        rankdata = data.argsort().argsort()
        # test for convergence: stop once the rank order stops changing
        if np.equal(rankdata, prev_rankdata).all():
            break
        if iteration == 1000:
            break
        else:
            rankings.append(rankdata)
            scores.append(data)
            prev_rankdata = rankdata
    if do_plot:
        plt.figure(figsize=(iteration/10, Mshape/20))
        plt.xlabel('Iteration')
        plt.ylabel('Rank, higher is better')
        plt.title('Rank Evolution')
        p = semilogx(range(1, iteration), rankings, '-,', alpha=0.5)
    return {fit_or_ubiq: scores[-1], 'iteration': iteration}
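# A minimal structural check on make_G_hat, using a made-up 2x3 matrix:
# by construction each column of G_cp and of G_pc sums to 1 (the
# denominateur terms normalise over the summed-out index), so the two
# matrices really do form a column-stochastic two-step Markov chain.
toy = np.array([[1., 1., 0.],
                [0., 1., 1.]])
toy_G_hat = make_G_hat(toy, alpha=1, beta=1)
print np.allclose(toy_G_hat['G_cp'].sum(axis=0), 1)  # True
print np.allclose(toy_G_hat['G_pc'].sum(axis=0), 1)  # True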
def w_star_analytic(M, alpha, beta, w_star_type):
    k_c = M.sum(axis=1)  # k_c, summing over the rows
    k_p = M.sum(axis=0)  # k_p, summing over the columns
    A = 1
    B = 1

    def Gcp_denominateur(M, p, k_c, beta):
        M_p = M[:, p]
        k_c_beta = k_c ** (-1 * beta)
        return np.dot(M_p, k_c_beta)

    def Gpc_denominateur(M, c, k_p, alpha):
        M_c = M[c, :]
        k_p_alpha = k_p ** (-1 * alpha)
        return np.dot(M_c, k_p_alpha)

    if w_star_type == 'w_star_c':
        w_star_c = np.zeros(shape=M.shape[0])
        for c in range(M.shape[0]):
            summand = Gpc_denominateur(M, c, k_p, alpha)
            k_beta = k_c[c] ** (-1 * beta)
            w_star_c[c] = A * summand * k_beta
        return w_star_c
    elif w_star_type == 'w_star_p':
        w_star_p = np.zeros(shape=M.shape[1])
        for p in range(M.shape[1]):
            summand = Gcp_denominateur(M, p, k_c, beta)
            k_alpha = k_p[p] ** (-1 * alpha)
            w_star_p[p] = B * summand * k_alpha
        return w_star_p

# purer python
# score
w_scores = w_star_analytic(M=rap_data['M'], alpha=0.5, beta=0.5, w_star_type='w_star_p')
# identify
w_ranks = {name: w_scores[pos] for name, pos in rap_data['user_dict'].iteritems()}
# sort
w_ranks_sorted = sorted(w_ranks.iteritems(), key=operator.itemgetter(1))

# or use pandas (DataFrame.sort is the old pandas API; newer pandas calls it sort_values)
w_scores_df = pd.DataFrame.from_dict(w_ranks, orient='index')
w_scores_df.columns = ['w_score']
w_scores_df.sort(columns=['w_score'], ascending=False).head()

convergence = find_convergence(M=rap_data['M'], alpha=0.5, beta=0.5,
                               fit_or_ubiq='fitness', do_plot=True)

'''I'm sure this can be done much more elegantly, but this was sort of
drink-a-lot-of-coffee-one-afternoon-and-get-it-done code;
cleaning it up is an exercise for the reader'''
def rank_comparison(a_ranks_sorted, b_ranks_sorted, do_plot=False):
    a_list = list()
    b_list = list()
    for atup in a_ranks_sorted:
        aiden = atup[0]
        apos = atup[1]
        # find this id in the other list
        for btup in b_ranks_sorted:
            biden = btup[0]
            bpos = btup[1]
            if aiden == biden:
                a_list.append(apos)
                b_list.append(bpos)
    if do_plot:
        plt.figure(figsize=(10, 20))
        plot([1, 2], [a_list, b_list], '-o')
        plt.show()
    return ss.spearmanr(a_list, b_list)

def calibrate_analytic(M, ua, exogenous_ranks_sorted, user_or_art_dict,
                       index_function, title, do_plot=False):
    if ua == 'users':
        w_star_type = 'w_star_p'
    elif ua == 'articles':
        w_star_type = 'w_star_c'
    squarelen = range(0, 25)
    alpha_range = map(index_function, squarelen)
    beta_range = map(index_function, squarelen)
    landscape = np.zeros(shape=(len(alpha_range), len(beta_range)))
    top_spearman = {'spearman': None, 'alpha': None, 'beta': None, 'ua': ua}
    for alpha_index, alpha in enumerate(alpha_range):
        for beta_index, beta in enumerate(beta_range):
            w_converged = w_star_analytic(M, alpha, beta, w_star_type)
            w_ranks = {name: w_converged[pos] for name, pos in user_or_art_dict.iteritems()}
            w_ranks_sorted = sorted(w_ranks.iteritems(), key=operator.itemgetter(1))
            spearman = rank_comparison(w_ranks_sorted, exogenous_ranks_sorted)
            # keep only statistically significant correlations
            if spearman[1] < 0.05:
                landscape[alpha_index][beta_index] = spearman[0]
                if (top_spearman['spearman'] is None) or (spearman[0] > top_spearman['spearman']):
                    top_spearman['spearman'] = spearman[0]
                    top_spearman['alpha'] = alpha
                    top_spearman['beta'] = beta
            else:
                landscape[alpha_index][beta_index] = np.nan
    if do_plot:
        plt.figure(figsize=(10, 10))
        heatmap = imshow(landscape, interpolation='nearest', vmin=-1, vmax=1)
        #heatmap = plt.pcolor(landscape)
        colorbar = plt.colorbar(heatmap)
        plt.xlabel(r'$ \beta $')
        plt.xticks(squarelen, beta_range, rotation=90)
        plt.ylabel(r'$ \alpha $')
        plt.yticks(squarelen, alpha_range)
        plt.title(title)
        landscape_file = open(title+'_landscape.npy', 'wb')  # binary mode for np.save
        np.save(landscape_file, landscape)
        plt.savefig(title+'_landscape.eps')
    return top_spearman
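# A small illustration of rank_comparison with hand-made rank lists (the
# 'u*' ids are invented): the two lists rank the same three users in
# opposite orders, so the Spearman rho should come out as exactly -1.
a_sorted = [('u1', 0), ('u2', 1), ('u3', 2)]
b_sorted = [('u3', 0), ('u2', 1), ('u1', 2)]
print rank_comparison(a_sorted, b_sorted)  # (-1.0, 0.0)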
def calibrate(data, ua, name):
    if ua == 'users':
        exogenous_ranks_sorted, user_or_art_dict = data['user_exogenous_ranks'], data['user_dict']
    else:
        exogenous_ranks_sorted, user_or_art_dict = data['article_exogenous_ranks'], data['article_dict']
    spearman = calibrate_analytic(M=data['M'],
                                  ua=ua,
                                  exogenous_ranks_sorted=exogenous_ranks_sorted,
                                  user_or_art_dict=user_or_art_dict,
                                  index_function=lambda x: (x-12.5)/6.25,
                                  title='Grid Search for %s correlation of %s' % (ua, name),
                                  do_plot=True)
    return {'rho': spearman['spearman'], 'alpha': spearman['alpha'], 'beta': spearman['beta']}

subdomain_names = !ls geniusdata/
# keep only the per-subdomain directories; the *_dictionary.json files live alongside them
subdomain_names = [name for name in subdomain_names if not name.endswith('.json')]
subdomain_data = {name: load_files('geniusdata/%s/' % name) for name in subdomain_names}
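# The index_function above maps the 25 grid indices onto evenly spaced
# alpha/beta values: (x - 12.5) / 6.25 runs from -2.0 at x=0 to 1.84 at
# x=24 in steps of 0.16, so the grid search covers a square around the
# (alpha, beta) = (1, 1) default used by make_G_hat.
print [(x - 12.5) / 6.25 for x in (0, 12, 24)]  # [-2.0, -0.08, 1.84]

# A usage sketch, assuming the rap subdomain loaded above: calibrate one
# subdomain by hand before looping over all of them.
print calibrate(subdomain_data['rap'], 'users', 'rap')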