#!/usr/bin/env python
# coding: utf-8

# # Node Label Prediction and Link Prediction

# In[1]:


import numpy as np
import matplotlib.pyplot as plt
import scipy as sp
import networkx as nx

get_ipython().run_line_magic('matplotlib', 'inline')


# We will start with node label prediction. Download [this](https://www.dropbox.com/s/sta1krsbnnp7ju0/ppi.CC.gml?dl=0) network. It contains protein-protein interactions in Baker's yeast. Each node (protein) has a binary attribute *ICSC (intracellular signaling cascade)*.

# In[3]:


g = nx.read_gml('seminar_prediction/ppi.CC.gml')
# Keep the largest connected component and relabel its nodes as 0..n-1
cc = max(nx.connected_components(g), key=len)
g = nx.subgraph(g, cc)
g = nx.relabel.convert_node_labels_to_integers(g)


# In[4]:


labels = np.array(list(nx.get_node_attributes(g, 'ICSC').values()), dtype=float)


# In[5]:


nx.draw_spring(g, node_color=labels)


# It might not be clear from the picture above, but the level of label homogeneity is quite high. For each node we can compute the average label value among its neighbours:

# In[8]:


nnICSC = np.array([np.mean(labels[list(g.neighbors(v))]) for v in g.nodes()])
nnICSC

plt.figure(figsize=(10, 5))
plt.hist(nnICSC[labels == 1], bins=6)


# ## Iterative Classification Method

# ICM is a kind of nearest-neighbour classifier: each unlabeled node receives the label that dominates among its labeled neighbours.

# #### Task 1
# * Randomly mark some nodes as unlabeled.
# * Implement the classification rule of ICM (HINT: look at the code cell above).
# * Implement a function for the classification error and use it wisely.

# In[9]:


lablNN = labels.copy()  # note: labels[:] would be a view, and writing NaNs into it would corrupt labels
idx = np.random.choice(len(lablNN), size=40, replace=False)
lablNN[idx] = np.nan


# In[12]:


idx = np.nonzero(np.isnan(lablNN))[0]
# Assign every unlabeled node the rounded mean label of its neighbours, ignoring NaNs
lablClassified = np.array([np.round(np.nanmean(lablNN[list(g.neighbors(v))])) for v in idx])
lablNN[idx] = lablClassified
lablNN


# In[13]:


def classError(trueLabel, predLabels):
    # Number of misclassified nodes
    return np.sum(trueLabel != predLabels)


# In[14]:


classError(labels, lablNN)


# ## Label Propagation

# Now, instead of looking only at immediate neighbours, we spread labels through the whole graph with random-walk steps.
#
# Just to recall the Label Propagation method:
# 1. Compute $P = D^{-1}A$
# 2. Set $Y^{(0)} = (Y_l, 0)$ ($Y_l$ - labeled data)
# 3. **repeat**
#     * $Y^{(t+1)} = PY^{(t)}$
#     * $Y_l^{(t+1)} = Y_l$
# 4. **until** $Y^{(t)}$ converges

# In[15]:


# It is better to initialize like this: shift labels to {1, 2} so that 0 can mean "unlabeled",
# and indicate the labeled nodes instead of the unlabeled ones
fixedLabels = labels.copy() + 1
curLabels = labels.copy() + 1

idxLabeled = np.zeros((g.number_of_nodes(),), dtype=bool)
idxLabeled[np.random.choice(len(labels), size=90, replace=False)] = True
curLabels[~idxLabeled] = 0


# In[16]:


def LabelPropagation(G, idxLabeled, curLabels, fixedLabels, iters=1000):
    A = np.asarray(nx.adjacency_matrix(G).todense())
    P = np.diag(1.0 / A.sum(axis=1)).dot(A)  # row-normalized transition matrix D^{-1}A
    iterNorm = 1.0
    resultLabels = P.dot(curLabels)
    deltaNorm = 1.0
    iterNum = 1
    while deltaNorm > 1e-3 and iterNum < iters and not np.all(resultLabels >= 1):
        iterNorm = np.linalg.norm(resultLabels - curLabels)
        # .copy() is essential: slicing would give two views of the same array
        runLabels, curLabels = resultLabels.copy(), resultLabels.copy()
        runLabels[idxLabeled] = fixedLabels[idxLabeled]  # clamp the labeled nodes
        resultLabels = P.dot(runLabels)
        deltaNorm = np.abs(iterNorm - np.linalg.norm(resultLabels - curLabels))
        print(deltaNorm)
        iterNum += 1
    print('Converged in {0} iterations'.format(iterNum))
    return np.round(resultLabels)


# In[18]:


result = LabelPropagation(g, idxLabeled, curLabels, fixedLabels, iters=1000)
result


# In[19]:


classError(labels + 1, result)
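# To get a feel for how Label Propagation behaves, we can check how the error depends on the
# number of labeled seed nodes. This is a minimal sketch on top of the objects defined above;
# the helper `lpError`, the averaging over random seed sets, and the seed sizes are our own
# additions, not part of the original seminar.

# In[ ]:


def lpError(nSeeds, nTrials=5):
    # Hypothetical helper: average LabelPropagation error over several random seed sets
    errors = []
    for _ in range(nTrials):
        fixed = labels.copy() + 1
        cur = labels.copy() + 1
        seeded = np.zeros(g.number_of_nodes(), dtype=bool)
        seeded[np.random.choice(len(labels), size=nSeeds, replace=False)] = True
        cur[~seeded] = 0
        pred = LabelPropagation(g, seeded, cur, fixed)
        # Evaluate only on the nodes that were not given a label
        errors.append(classError(labels[~seeded] + 1, pred[~seeded]))
    return np.mean(errors)

for n in [10, 30, 90]:
    print('{0} seeds: avg. error = {1:.1f} misclassified nodes'.format(n, lpError(n)))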
# ## Link Prediction - Scoring functions

# In this section we will implement several scoring functions for link prediction and compare their values for adjacent and non-adjacent pairs of nodes.
#
# Load the [french blog network](https://www.dropbox.com/s/rn0y18a511vfx1t/fblog.gml?dl=0) and compute the following scores:

# In[20]:


g = nx.read_gml('./seminar_prediction/fblog.gml')
# Relabel nodes as 0..n-1 so they can index the score matrices directly
g = nx.relabel.convert_node_labels_to_integers(g)
vNum = g.number_of_nodes()


# In[21]:


def matrixPlot(A):
    plt.figure(1, figsize=(6, 6))
    plt.imshow(A, interpolation="none")


# #### Shortest Path Length

# In[22]:


# Your code here
spScore = nx.shortest_paths.floyd_warshall_numpy(g)
matrixPlot(spScore)


# #### Number of common neighbours

# In[27]:


# Your code here
nnScore = np.zeros((vNum, vNum))
for i in range(vNum):
    for j in range(i + 1, vNum):
        nnScore[i, j] = len(np.intersect1d(list(g.neighbors(i)), list(g.neighbors(j))))
nnScore[0, :]


# #### Jaccard Score

# In[29]:


# Your code here
jaScore = np.zeros((vNum, vNum))
for i in range(vNum):
    for j in range(i + 1, vNum):
        ni, nj = list(g.neighbors(i)), list(g.neighbors(j))
        jaScore[i, j] = 1.0 * len(np.intersect1d(ni, nj)) / len(np.union1d(ni, nj))
jaScore[0, :]


# #### Adamic/Adar Score
# $Score(a,b) = \sum\limits_{v \in \text{NN}(a) \cap \text{NN}(b)} \frac{1}{\log(|\text{NN}(v)|)}$

# In[31]:


# Your code here
aaScore = np.zeros((vNum, vNum))
for i in range(vNum):
    for j in range(i + 1, vNum):
        for v in nx.common_neighbors(g, i, j):
            aaScore[i, j] += 1.0 / np.log(g.degree(v))
aaScore[0, :]


# #### Preferential Attachment score
# $Score(a,b) = |\text{NN}(a)| \times |\text{NN}(b)|$

# In[32]:


# Your code here
paScore = np.zeros((vNum, vNum))
for i in range(vNum):
    for j in range(i + 1, vNum):
        paScore[i, j] = g.degree(i) * g.degree(j)


# #### Katz Score
# $Score(a,b) = \sum\limits_{p \,\in\, \text{Paths}_{a,b}} \beta^{|p|}$, where $|p|$ is the length of path $p$ (here we truncate to simple paths of length at most 4)

# In[ ]:


# Your code here
katzScore = np.zeros((vNum, vNum))
beta = 0.6
for i in range(vNum):
    for j in range(i + 1, vNum):
        # A path is a list of nodes, so its length in edges is len(p) - 1
        lengths = [len(p) - 1 for p in nx.all_simple_paths(g, i, j, cutoff=4)]
        katzScore[i, j] = np.sum(np.power(beta, lengths))


# Let's compare the scores' behaviour for pairs of nodes with and without an edge in-between

# In[42]:


A = np.asarray(nx.adjacency_matrix(g).todense())
xyTriu = np.vstack(np.triu_indices_from(A, k=1)).T
wEdge = [nnScore[xy[0], xy[1]] for xy in xyTriu if A[xy[0], xy[1]]]
woEdge = [nnScore[xy[0], xy[1]] for xy in xyTriu if not A[xy[0], xy[1]]]
data = [wEdge, woEdge]


# In[43]:


fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(10, 5))
axes.violinplot(data, showmeans=True)
axes.set_xticks([1, 2])
axes.set_xticklabels(['With Edge', 'Without Edge'])
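# Beyond eyeballing the violin plot, we can score each heuristic quantitatively. Below is a
# minimal sketch, assuming scikit-learn is available (an extra dependency the seminar does not
# require): treat each score as a predictor of edge presence over all node pairs and compute
# ROC-AUC. Note this is only a sanity check against the observed graph; a proper link-prediction
# evaluation would hide a test set of edges before computing the scores.

# In[ ]:


from sklearn.metrics import roc_auc_score

yTrue = np.array([A[x, y] for x, y in xyTriu])  # 1 if the pair is connected
for name, score in [('Common neighbours', nnScore), ('Jaccard', jaScore),
                    ('Adamic/Adar', aaScore), ('Pref. attachment', paScore),
                    ('Katz', katzScore)]:
    yScore = np.array([score[x, y] for x, y in xyTriu])
    print('{0}: AUC = {1:.3f}'.format(name, roc_auc_score(yTrue, yScore)))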