#!/usr/bin/env python
# coding: utf-8

# # Node Label Prediction and Link Prediction

# In[1]:


import numpy as np
import matplotlib.pyplot as plt
import scipy as sp
import networkx as nx

get_ipython().run_line_magic('matplotlib', 'inline')


# We will start with node label prediction. Download [this](https://www.dropbox.com/s/sta1krsbnnp7ju0/ppi.CC.gml?dl=0) network. It contains protein-protein interactions in Baker's yeast. Each node (protein) has a binary attribute *ICSC (intracellular signaling cascade)*.

# In[3]:


g = nx.read_gml('seminar_prediction/ppi.CC.gml')
# Keep the largest connected component and relabel its nodes as 0..n-1
cc = max(nx.connected_components(g), key=len)
g = nx.subgraph(g, cc)
g = nx.relabel.convert_node_labels_to_integers(g)


# In[4]:


labels = np.array(list(nx.get_node_attributes(g, 'ICSC').values()), dtype=float)


# In[5]:


nx.draw_spring(g, node_color=labels)


# It might not be clear from the picture above, but the level of label homogeneity is quite high. For each node we can compute the average label value among its neighbours:

# In[8]:


nnICSC = np.array([np.mean(labels[list(g.neighbors(v))]) for v in g.nodes()])
nnICSC

plt.figure(figsize=(10, 5))
plt.hist(nnICSC[labels == 1], bins=6)


# ## Iterative Classification Method

# ICM is a kind of nearest-neighbour classifier: each unlabeled node receives the label that dominates among its labeled neighbours.

# #### Task 1
# * Randomly mark some nodes as unlabeled.
# * Implement the classification rule of ICM (HINT: look at the code cell above).
# * Implement a function for the classification error and use it wisely.

# In[9]:


lablNN = labels.copy()  # note: labels[:] would be a view, and writing NaNs into it would corrupt labels
idx = np.random.choice(len(lablNN), size=40, replace=False)
lablNN[idx] = np.nan


# In[12]:


idx = np.nonzero(np.isnan(lablNN))[0]
# Assign every unlabeled node the rounded mean label of its neighbours, ignoring NaNs
lablClassified = np.array([np.round(np.nanmean(lablNN[list(g.neighbors(v))])) for v in idx])
lablNN[idx] = lablClassified
lablNN


# In[13]:


def classError(trueLabel, predLabels):
    # Number of misclassified nodes
    return np.sum(trueLabel != predLabels)


# In[14]:


classError(labels, lablNN)


# ## Label Propagation

# Now, instead of looking only at immediate neighbours, we spread labels through the whole graph with random-walk steps.
#
# Just to recall the Label Propagation method:
# 1. Compute $P = D^{-1}A$
# 2. Set $Y^{(0)} = (Y_l, 0)$ ($Y_l$ - labeled data)
# 3. **repeat**
#     * $Y^{(t+1)} = PY^{(t)}$
#     * $Y_l^{(t+1)} = Y_l$
# 4. **until** $Y^{(t)}$ converges

# In[15]:


# It is better to initialize like this: shift labels to {1, 2} so that 0 can mean "unlabeled",
# and indicate the labeled nodes instead of the unlabeled ones
fixedLabels = labels.copy() + 1
curLabels = labels.copy() + 1

idxLabeled = np.zeros((g.number_of_nodes(),), dtype=bool)
idxLabeled[np.random.choice(len(labels), size=90, replace=False)] = True
curLabels[~idxLabeled] = 0


# In[16]:


def LabelPropagation(G, idxLabeled, curLabels, fixedLabels, iters=1000):
    A = np.asarray(nx.adjacency_matrix(G).todense())
    P = np.diag(1.0 / A.sum(axis=1)).dot(A)  # row-normalized transition matrix D^{-1}A
    iterNorm = 1.0
    resultLabels = P.dot(curLabels)
    deltaNorm = 1.0
    iterNum = 1
    while deltaNorm > 1e-3 and iterNum < iters and not np.all(resultLabels >= 1):
        iterNorm = np.linalg.norm(resultLabels - curLabels)
        # .copy() is essential: slicing would give two views of the same array
        runLabels, curLabels = resultLabels.copy(), resultLabels.copy()
        runLabels[idxLabeled] = fixedLabels[idxLabeled]  # clamp the labeled nodes
        resultLabels = P.dot(runLabels)
        deltaNorm = np.abs(iterNorm - np.linalg.norm(resultLabels - curLabels))
        print(deltaNorm)
        iterNum += 1
    print('Converged in {0} iterations'.format(iterNum))
    return np.round(resultLabels)


# In[18]:


result = LabelPropagation(g, idxLabeled, curLabels, fixedLabels, iters=1000)
result


# In[19]:


classError(labels + 1, result)
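# To get a feel for how Label Propagation behaves, we can check how the error depends on the
# number of labeled seed nodes. This is a minimal sketch on top of the objects defined above;
# the helper `lpError`, the averaging over random seed sets, and the seed sizes are our own
# additions, not part of the original seminar.

# In[ ]:


def lpError(nSeeds, nTrials=5):
    # Hypothetical helper: average LabelPropagation error over several random seed sets
    errors = []
    for _ in range(nTrials):
        fixed = labels.copy() + 1
        cur = labels.copy() + 1
        seeded = np.zeros(g.number_of_nodes(), dtype=bool)
        seeded[np.random.choice(len(labels), size=nSeeds, replace=False)] = True
        cur[~seeded] = 0
        pred = LabelPropagation(g, seeded, cur, fixed)
        # Evaluate only on the nodes that were not given a label
        errors.append(classError(labels[~seeded] + 1, pred[~seeded]))
    return np.mean(errors)

for n in [10, 30, 90]:
    print('{0} seeds: avg. error = {1:.1f} misclassified nodes'.format(n, lpError(n)))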
# ## Link Prediction - Scoring functions

# In this section we will implement several scoring functions for link prediction and compare their values for adjacent and non-adjacent pairs of nodes.
#
# Load the [french blog network](https://www.dropbox.com/s/rn0y18a511vfx1t/fblog.gml?dl=0) and compute the following scores:

# In[20]:


g = nx.read_gml('./seminar_prediction/fblog.gml')
# Relabel nodes as 0..n-1 so they can index the score matrices directly
g = nx.relabel.convert_node_labels_to_integers(g)
vNum = g.number_of_nodes()


# In[21]:


def matrixPlot(A):
    plt.figure(1, figsize=(6, 6))
    plt.imshow(A, interpolation="none")


# #### Shortest Path Length

# In[22]:


# Your code here
spScore = nx.shortest_paths.floyd_warshall_numpy(g)
matrixPlot(spScore)


# #### Number of common neighbours

# In[27]:


# Your code here
nnScore = np.zeros((vNum, vNum))
for i in range(vNum):
    for j in range(i + 1, vNum):
        nnScore[i, j] = len(np.intersect1d(list(g.neighbors(i)), list(g.neighbors(j))))
nnScore[0, :]


# #### Jaccard Score

# In[29]:


# Your code here
jaScore = np.zeros((vNum, vNum))
for i in range(vNum):
    for j in range(i + 1, vNum):
        ni, nj = list(g.neighbors(i)), list(g.neighbors(j))
        jaScore[i, j] = 1.0 * len(np.intersect1d(ni, nj)) / len(np.union1d(ni, nj))
jaScore[0, :]


# #### Adamic/Adar Score
# $Score(a,b) = \sum\limits_{v \in \text{NN}(a) \cap \text{NN}(b)} \frac{1}{\log(|\text{NN}(v)|)}$

# In[31]:


# Your code here
aaScore = np.zeros((vNum, vNum))
for i in range(vNum):
    for j in range(i + 1, vNum):
        for v in nx.common_neighbors(g, i, j):
            aaScore[i, j] += 1.0 / np.log(g.degree(v))
aaScore[0, :]


# #### Preferential Attachment score
# $Score(a,b) = |\text{NN}(a)| \times |\text{NN}(b)|$

# In[32]:


# Your code here
paScore = np.zeros((vNum, vNum))
for i in range(vNum):
    for j in range(i + 1, vNum):
        paScore[i, j] = g.degree(i) * g.degree(j)


# #### Katz Score
# $Score(a,b) = \sum\limits_{p \,\in\, \text{Paths}_{a,b}} \beta^{|p|}$, where $|p|$ is the length of path $p$ (here we truncate to simple paths of length at most 4)

# In[ ]:


# Your code here
katzScore = np.zeros((vNum, vNum))
beta = 0.6
for i in range(vNum):
    for j in range(i + 1, vNum):
        # A path is a list of nodes, so its length in edges is len(p) - 1
        lengths = [len(p) - 1 for p in nx.all_simple_paths(g, i, j, cutoff=4)]
        katzScore[i, j] = np.sum(np.power(beta, lengths))


# Let's compare the scores' behaviour for pairs of nodes with and without an edge in-between

# In[42]:


A = np.asarray(nx.adjacency_matrix(g).todense())
xyTriu = np.vstack(np.triu_indices_from(A, k=1)).T
wEdge = [nnScore[xy[0], xy[1]] for xy in xyTriu if A[xy[0], xy[1]]]
woEdge = [nnScore[xy[0], xy[1]] for xy in xyTriu if not A[xy[0], xy[1]]]
data = [wEdge, woEdge]


# In[43]:


fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(10, 5))
axes.violinplot(data, showmeans=True)
axes.set_xticks([1, 2])
axes.set_xticklabels(['With Edge', 'Without Edge'])
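# Beyond eyeballing the violin plot, we can score each heuristic quantitatively. Below is a
# minimal sketch, assuming scikit-learn is available (an extra dependency the seminar does not
# require): treat each score as a predictor of edge presence over all node pairs and compute
# ROC-AUC. Note this is only a sanity check against the observed graph; a proper link-prediction
# evaluation would hide a test set of edges before computing the scores.

# In[ ]:


from sklearn.metrics import roc_auc_score

yTrue = np.array([A[x, y] for x, y in xyTriu])  # 1 if the pair is connected
for name, score in [('Common neighbours', nnScore), ('Jaccard', jaScore),
                    ('Adamic/Adar', aaScore), ('Pref. attachment', paScore),
                    ('Katz', katzScore)]:
    yScore = np.array([score[x, y] for x, y in xyTriu])
    print('{0}: AUC = {1:.3f}'.format(name, roc_auc_score(yTrue, yScore)))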