user_df = pd.DataFrame.from_dict(user_calibrations, orient='index')
user_df

article_df = pd.DataFrame.from_dict(article_calibrations, orient='index')
article_df

for name, bipartite_dict in bipartite_dicts.iteritems():
    M, text_dict, user_dict = viz_bipartite(bipartite_dict, name)
    np.save('geniusdata/'+name+'/M.npy', M)
    json.dump(text_dict, open('geniusdata/'+name+'/text_dict.json', 'w'))
    json.dump(user_dict, open('geniusdata/'+name+'/user_dict.json', 'w'))
    text_exogenous_ranks = make_exogenous_ranks(text_dict, all_text_exogenous)
    user_exogenous_ranks = make_exogenous_ranks(user_dict, all_user_exogenous)
    json.dump(text_exogenous_ranks, open('geniusdata/'+name+'/text_exogenous_ranks.json', 'w'))
    json.dump(user_exogenous_ranks, open('geniusdata/'+name+'/user_exogenous_ranks.json', 'w'))

article_calibrations = dict()
for name, data in subdomain_data.iteritems():
    calibrations = calibrate(data, 'articles', name)
    print name, calibrations
    article_calibrations[name] = calibrations

user_calibrations = dict()
for name, data in subdomain_data.iteritems():
    calibrations = calibrate(data, 'users', name)
    print name, calibrations
    user_calibrations[name] = calibrations

import json
import numpy as np
import networkx as nx
from collections import defaultdict
import operator
import pandas as pd
import scipy.stats as ss
# plotting names used bare below (plt, imshow, plot, semilogx, figsize)
# appear to come from the notebook's %pylab inline mode

!ls geniusdata/

genius_data = {prefix: json.load(open('geniusdata/%s_dictionary.json' % prefix, 'r'))
               for prefix in ['tag', 'text', 'user']}

# sanity check: every annotating user should appear in the user dictionary
users = set()
for text_id, text_dict in genius_data['text'].iteritems():
    for user in text_dict['annotating_users']:
        users.add(user)
print len(users) == len(genius_data['user'])

# need to reverse the tag dict
id_to_tag = dict()
for name, tag_id in genius_data['tag'].iteritems():
    id_to_tag[tag_id] = name

subdomains = defaultdict(dict)
for text_id, text_dict in genius_data['text'].iteritems():
    tag_id = text_dict['tag_id']
    tag_name = id_to_tag[tag_id]
    subdomains[tag_name][text_id] = text_dict

print {subdomain_name: len(subdomain_dict)
       for subdomain_name, subdomain_dict in subdomains.iteritems()}

def make_bipartite_dict(subdomain_dict):
    # prefix ids with 't'/'u' so text and user node names cannot collide
    return {'t'+text_id: ['u'+str(user) for user in text_dict['annotating_users']]
            for text_id, text_dict in subdomain_dict.iteritems()}

bipartite_dicts = {subdomain_name: make_bipartite_dict(subdomain_dict)
                   for subdomain_name, subdomain_dict in subdomains.iteritems()}

def viz_bipartite(bipartite, name):
    bipartite_G = nx.Graph()
    text_encountered = list()
    user_encountered = list()
    for text, user_list in bipartite.iteritems():
        text_encountered.append(text)
        for user in user_list:
            bipartite_G.add_edge(text, user)
            if user not in user_encountered:
                user_encountered.append(user)
    M = nx.algorithms.bipartite.basic.biadjacency_matrix(G=bipartite_G,
                                                         row_order=text_encountered,
                                                         column_order=user_encountered)
    # I wish numpy weren't so procedural, so I didn't have to do this;
    # pandas does it right by returning the object instead of sorting in place
    return_M = M.copy()
    text_dict = {text: text_encountered.index(text) for text in text_encountered}
    user_dict = {user: user_encountered.index(user) for user in user_encountered}
    plt.figure(figsize=(15, 15))
    fig = imshow(M, cmap=plt.cm.gray_r, interpolation='nearest')
    fig.axes.set_title('"Raw" Text-User Matrix for %s' % name)
    fig.axes.set_xlabel('users')
    fig.axes.set_ylabel('texts')
    plt.show()
    # in-place sorts, which is why return_M was copied above
    M.sort(axis=0)
    M.sort(axis=1)
    plt.figure(figsize=(15, 15))
    fig = imshow(M, cmap=plt.cm.gray_r, interpolation='nearest')
    fig.axes.set_xlim(fig.axes.get_xlim()[::-1])
    fig.axes.set_title('"Sorted" Text-User Matrix for %s' % name)
    fig.axes.set_xlabel('users')
    fig.axes.set_ylabel('texts')
    plt.show()
    return return_M, text_dict, user_dict
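# A minimal sanity check for viz_bipartite on a made-up toy bipartite dict
# (the 't*'/'u*' ids below are invented for illustration). Since 'u2'
# annotates both texts, its column in the biadjacency matrix should be all
# ones, and the returned dicts should map each id to its row/column index.
toy_bipartite = {'t1': ['u1', 'u2'], 't2': ['u2', 'u3']}
toy_M, toy_text_dict, toy_user_dict = viz_bipartite(toy_bipartite, 'toy')
print toy_M.shape                     # (2, 3): one row per text, one column per user
print toy_M[:, toy_user_dict['u2']]   # all ones for the shared annotator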
all_user_exogenous = {'u'+user_id: activity
                      for user_id, activity in genius_data['user'].iteritems()}
all_text_exogenous = {'t'+text_id: text_dict['annotations_count']
                      for text_id, text_dict in genius_data['text'].iteritems()}

def make_exogenous_ranks(specific_dict, all_dict):
    '''ranks the ids in specific_dict by their exogenous score from all_dict'''
    exogenous_scores = [(user_id, score) for user_id, score in all_dict.iteritems()
                        if user_id in specific_dict]
    exogenous_scores_order = sorted(exogenous_scores, key=lambda tup: tup[1])
    exogenous_ranks_order = [tup[0] for tup in exogenous_scores_order]
    exogenous_ranks = zip(exogenous_ranks_order, range(len(exogenous_ranks_order)))
    return exogenous_ranks

'''for subdomain_name, bipartite_dict in bipartite_dicts.iteritems():
    print subdomain_name
    plt.figure(figsize=(10,10))
    bipartite_network = nx.Graph(data=bipartite_dict)
    #TODO colouring the nodetypes
    nx.draw_spring(bipartite_network)
'''

def load_files(folder):
    M = np.load(folder+'M.npy')
    user_dict = json.load(open(folder+'user_dict.json', 'r'))
    article_dict = json.load(open(folder+'text_dict.json', 'r'))
    user_exogenous_ranks = json.load(open(folder+'user_exogenous_ranks.json', 'r'))
    article_exogenous_ranks = json.load(open(folder+'text_exogenous_ranks.json', 'r'))
    return {'M': M,
            'user_dict': user_dict,
            'article_dict': article_dict,
            'user_exogenous_ranks': user_exogenous_ranks,
            'article_exogenous_ranks': article_exogenous_ranks}

rap_data = load_files('geniusdata/rap/')

def Gcp_denominateur(M, p, k_c, beta):
    M_p = M[:, p]
    k_c_beta = k_c ** (-1 * beta)
    return np.dot(M_p, k_c_beta)

def Gpc_denominateur(M, c, k_p, alpha):
    M_c = M[c, :]
    k_p_alpha = k_p ** (-1 * alpha)
    return np.dot(M_c, k_p_alpha)

def make_G_hat(M, alpha=1, beta=1):
    '''G hat is a Markov chain of length 2:
    G_cp is the matrix to go from countries to products,
    and G_pc is the matrix to go from products to countries'''
    k_c = M.sum(axis=1)  # k_c, summing over the rows
    k_p = M.sum(axis=0)  # k_p, summing over the columns

    G_cp = np.zeros(shape=M.shape)  # Gcp_beta
    for [c, p], val in np.ndenumerate(M):
        numerateur = M[c, p] * (k_c[c] ** (-1 * beta))
        denominateur = Gcp_denominateur(M, p, k_c, beta)
        G_cp[c, p] = numerateur / float(denominateur)

    G_pc = np.zeros(shape=M.T.shape)  # Gpc_alpha
    for [p, c], val in np.ndenumerate(M.T):
        numerateur = M.T[p, c] * (k_p[p] ** (-1 * alpha))
        denominateur = Gpc_denominateur(M, c, k_p, alpha)
        G_pc[p, c] = numerateur / float(denominateur)

    return {'G_cp': G_cp, 'G_pc': G_pc}

def w_generator(M, alpha, beta):
    # this cannot return the zeroth iteration
    G_hat = make_G_hat(M, alpha, beta)
    G_cp = G_hat['G_cp']
    G_pc = G_hat['G_pc']
    fitness_0 = np.sum(M, 1)
    ubiquity_0 = np.sum(M, 0)
    fitness_next = fitness_0
    ubiquity_next = ubiquity_0
    i = 0
    while True:
        fitness_prev = fitness_next
        ubiquity_prev = ubiquity_next
        i += 1
        fitness_next = np.sum(G_cp * ubiquity_prev, axis=1)
        ubiquity_next = np.sum(G_pc * fitness_prev, axis=1)
        yield {'iteration': i, 'fitness': fitness_next, 'ubiquity': ubiquity_next}

def w_stream(M, i, alpha, beta):
    """gets the i'th iteration of reflections of M, but in a memory-safe way,
    so we can calculate many generations"""
    if i < 0:
        raise ValueError
    for j in w_generator(M, alpha, beta):
        # the generator yields dicts, so index by key rather than by position
        if j['iteration'] == i:
            return {'fitness': j['fitness'], 'ubiquity': j['ubiquity']}

def find_convergence(M, alpha, beta, fit_or_ubiq, do_plot=False):
    '''finds the convergence point (or gives up after 1000 iterations)'''
    if fit_or_ubiq == 'fitness':
        Mshape = M.shape[0]
    elif fit_or_ubiq == 'ubiquity':
        Mshape = M.shape[1]
    rankings = list()
    scores = list()
    prev_rankdata = np.zeros(Mshape)
    iteration = 0
    for stream_data in w_generator(M, alpha, beta):
        iteration = stream_data['iteration']
        data = stream_data[fit_or_ubiq]
        rankdata = data.argsort().argsort()
        # test for convergence: stop once the rank order stops changing
        if np.equal(rankdata, prev_rankdata).all():
            break
        if iteration == 1000:
            break
        else:
            rankings.append(rankdata)
            scores.append(data)
            prev_rankdata = rankdata
    if do_plot:
        plt.figure(figsize=(iteration/10, Mshape/20))
        plt.xlabel('Iteration')
        plt.ylabel('Rank, higher is better')
        plt.title('Rank Evolution')
        p = semilogx(range(1, iteration), rankings, '-,', alpha=0.5)
    return {fit_or_ubiq: scores[-1], 'iteration': iteration}
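# A minimal structural check on make_G_hat, using a made-up 2x3 matrix:
# by construction each column of G_cp and of G_pc sums to 1 (the
# denominateur terms normalise over the summed-out index), so the two
# matrices really do form a column-stochastic two-step Markov chain.
toy = np.array([[1., 1., 0.],
                [0., 1., 1.]])
toy_G_hat = make_G_hat(toy, alpha=1, beta=1)
print np.allclose(toy_G_hat['G_cp'].sum(axis=0), 1)  # True
print np.allclose(toy_G_hat['G_pc'].sum(axis=0), 1)  # True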
def w_star_analytic(M, alpha, beta, w_star_type):
    k_c = M.sum(axis=1)  # k_c, summing over the rows
    k_p = M.sum(axis=0)  # k_p, summing over the columns
    A = 1
    B = 1

    def Gcp_denominateur(M, p, k_c, beta):
        M_p = M[:, p]
        k_c_beta = k_c ** (-1 * beta)
        return np.dot(M_p, k_c_beta)

    def Gpc_denominateur(M, c, k_p, alpha):
        M_c = M[c, :]
        k_p_alpha = k_p ** (-1 * alpha)
        return np.dot(M_c, k_p_alpha)

    if w_star_type == 'w_star_c':
        w_star_c = np.zeros(shape=M.shape[0])
        for c in range(M.shape[0]):
            summand = Gpc_denominateur(M, c, k_p, alpha)
            k_beta = k_c[c] ** (-1 * beta)
            w_star_c[c] = A * summand * k_beta
        return w_star_c
    elif w_star_type == 'w_star_p':
        w_star_p = np.zeros(shape=M.shape[1])
        for p in range(M.shape[1]):
            summand = Gcp_denominateur(M, p, k_c, beta)
            k_alpha = k_p[p] ** (-1 * alpha)
            w_star_p[p] = B * summand * k_alpha
        return w_star_p

# purer python
# score
w_scores = w_star_analytic(M=rap_data['M'], alpha=0.5, beta=0.5, w_star_type='w_star_p')
# identify
w_ranks = {name: w_scores[pos] for name, pos in rap_data['user_dict'].iteritems()}
# sort
w_ranks_sorted = sorted(w_ranks.iteritems(), key=operator.itemgetter(1))

# or use pandas (DataFrame.sort is the old pandas API; newer pandas calls it sort_values)
w_scores_df = pd.DataFrame.from_dict(w_ranks, orient='index')
w_scores_df.columns = ['w_score']
w_scores_df.sort(columns=['w_score'], ascending=False).head()

convergence = find_convergence(M=rap_data['M'], alpha=0.5, beta=0.5,
                               fit_or_ubiq='fitness', do_plot=True)

'''I'm sure this can be done much more elegantly, but this was sort of
drink-a-lot-of-coffee-one-afternoon-and-get-it-done code;
cleaning it up is an exercise for the reader'''
def rank_comparison(a_ranks_sorted, b_ranks_sorted, do_plot=False):
    a_list = list()
    b_list = list()
    for atup in a_ranks_sorted:
        aiden = atup[0]
        apos = atup[1]
        # find this id in the other list
        for btup in b_ranks_sorted:
            biden = btup[0]
            bpos = btup[1]
            if aiden == biden:
                a_list.append(apos)
                b_list.append(bpos)
    if do_plot:
        plt.figure(figsize=(10, 20))
        plot([1, 2], [a_list, b_list], '-o')
        plt.show()
    return ss.spearmanr(a_list, b_list)

def calibrate_analytic(M, ua, exogenous_ranks_sorted, user_or_art_dict,
                       index_function, title, do_plot=False):
    if ua == 'users':
        w_star_type = 'w_star_p'
    elif ua == 'articles':
        w_star_type = 'w_star_c'
    squarelen = range(0, 25)
    alpha_range = map(index_function, squarelen)
    beta_range = map(index_function, squarelen)
    landscape = np.zeros(shape=(len(alpha_range), len(beta_range)))
    top_spearman = {'spearman': None, 'alpha': None, 'beta': None, 'ua': ua}
    for alpha_index, alpha in enumerate(alpha_range):
        for beta_index, beta in enumerate(beta_range):
            w_converged = w_star_analytic(M, alpha, beta, w_star_type)
            w_ranks = {name: w_converged[pos] for name, pos in user_or_art_dict.iteritems()}
            w_ranks_sorted = sorted(w_ranks.iteritems(), key=operator.itemgetter(1))
            spearman = rank_comparison(w_ranks_sorted, exogenous_ranks_sorted)
            # keep only statistically significant correlations
            if spearman[1] < 0.05:
                landscape[alpha_index][beta_index] = spearman[0]
                if (top_spearman['spearman'] is None) or (spearman[0] > top_spearman['spearman']):
                    top_spearman['spearman'] = spearman[0]
                    top_spearman['alpha'] = alpha
                    top_spearman['beta'] = beta
            else:
                landscape[alpha_index][beta_index] = np.nan
    if do_plot:
        plt.figure(figsize=(10, 10))
        heatmap = imshow(landscape, interpolation='nearest', vmin=-1, vmax=1)
        #heatmap = plt.pcolor(landscape)
        colorbar = plt.colorbar(heatmap)
        plt.xlabel(r'$ \beta $')
        plt.xticks(squarelen, beta_range, rotation=90)
        plt.ylabel(r'$ \alpha $')
        plt.yticks(squarelen, alpha_range)
        plt.title(title)
        landscape_file = open(title+'_landscape.npy', 'wb')  # binary mode for np.save
        np.save(landscape_file, landscape)
        plt.savefig(title+'_landscape.eps')
    return top_spearman
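# A small illustration of rank_comparison with hand-made rank lists (the
# 'u*' ids are invented): the two lists rank the same three users in
# opposite orders, so the Spearman rho should come out as exactly -1.
a_sorted = [('u1', 0), ('u2', 1), ('u3', 2)]
b_sorted = [('u3', 0), ('u2', 1), ('u1', 2)]
print rank_comparison(a_sorted, b_sorted)  # (-1.0, 0.0)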
def calibrate(data, ua, name):
    if ua == 'users':
        exogenous_ranks_sorted, user_or_art_dict = data['user_exogenous_ranks'], data['user_dict']
    else:
        exogenous_ranks_sorted, user_or_art_dict = data['article_exogenous_ranks'], data['article_dict']
    spearman = calibrate_analytic(M=data['M'],
                                  ua=ua,
                                  exogenous_ranks_sorted=exogenous_ranks_sorted,
                                  user_or_art_dict=user_or_art_dict,
                                  index_function=lambda x: (x-12.5)/6.25,
                                  title='Grid Search for %s correlation of %s' % (ua, name),
                                  do_plot=True)
    return {'rho': spearman['spearman'], 'alpha': spearman['alpha'], 'beta': spearman['beta']}

subdomain_names = !ls geniusdata/
# keep only the per-subdomain directories; the *_dictionary.json files live alongside them
subdomain_names = [name for name in subdomain_names if not name.endswith('.json')]
subdomain_data = {name: load_files('geniusdata/%s/' % name) for name in subdomain_names}
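# The index_function above maps the 25 grid indices onto evenly spaced
# alpha/beta values: (x - 12.5) / 6.25 runs from -2.0 at x=0 to 1.84 at
# x=24 in steps of 0.16, so the grid search covers a square around the
# (alpha, beta) = (1, 1) default used by make_G_hat.
print [(x - 12.5) / 6.25 for x in (0, 12, 24)]  # [-2.0, -0.08, 1.84]

# A usage sketch, assuming the rap subdomain loaded above: calibrate one
# subdomain by hand before looping over all of them.
print calibrate(subdomain_data['rap'], 'users', 'rap')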