#!/usr/bin/env python
# coding: utf-8

# In[1]:

get_ipython().run_line_magic('matplotlib', 'inline')

import pandas as pd
import numpy as np

import itertools

import matplotlib.pyplot as plt
import seaborn as sns
plt.rcParams['axes.grid'] = False

import logging
logger = logging.getLogger()


# 5 Link Analysis
# ========
#
# ### 5.1 PageRank

# #### 5.1.1 Early Search Engines and Term Spam

# **inverted index**:
# a data structure that makes it easy to find all the places where a given term occurs.
#
# **term spam**:
# techniques for fooling search engines into believing a page is about something it is not.
#
# To combat term spam, Google introduced two innovations:
#
# 1. PageRank simulates where Web surfers would tend to congregate. $\gets$ random surfer.
#
# 2. The content of a page is judged not only by the terms on the page itself, but also by the terms used in links to that page. $\gets$ "vote with their feet."

# #### 5.1.2 Definition of PageRank

# intent: the higher the PageRank of a page, the more "important" it is.
#
# **transition matrix**:
# describes what happens to random surfers after one step.
# $m_{ij} = \frac{1}{k}$ if page $j$ has $k$ arcs out, and one of them is to page $i$.
# Otherwise, $m_{ij} = 0$.

# In[2]:

plt.imshow(plt.imread('./res/fig_5_1.png'))


# In[3]:

plt.imshow(plt.imread('./res/eg_5_1.png'))


# The PageRank vector $v$ simulates random surfers:
#
# 1. Start at a random page among all $n$:
# $v_i^{(0)} = \frac{1}{n}, \quad i = 1, 2, \dotsc, n$.
#
# 2. Randomly follow one of the links out of the current page:
# $v^{(k+1)} = M v^{(k)}$
# gives the distribution of the surfer after $k+1$ steps.

# In[4]:

# eg. 5.1
matrix_5_1 = np.array([
        [0, 1/3, 1/3, 1/3],
        [1/2, 0, 0, 1/2],
        [1, 0, 0, 0],
        [0, 1/2, 1/2, 0]
    ]).T
matrix_5_1


# In[5]:

n = matrix_5_1.shape[1]
v = np.ones((n, 1)) / n
v


# In[6]:

def dist_after_surfing(M, v=None, steps=1):
    """Return the surfer distribution after `steps` applications of the transition matrix M."""
    if v is None:
        n = M.shape[1]
        v = np.ones((n, 1)) / n
    for __ in range(steps):
        v = M.dot(v)
    return v

steps = 1
v_ = dist_after_surfing(matrix_5_1, v, steps)
v_


# **Markov processes**:
# It is known that the distribution of the surfer approaches a limiting distribution $v$ that satisfies $v = Mv$, provided two conditions are met:
#
# 1. The graph is strongly connected,
# namely, it is possible to get from any node to any other node.
#
# 2. There are no dead ends.
#
# Eigenvalues and eigenvectors: $Mx = \lambda x$.
# $v$ is the eigenvector of $M$ with eigenvalue $\lambda = 1$, scaled so that $\sum_i v_i = 1$.
#
# $v$ is a [stochastic vector](http://www.math.harvard.edu/~knill/teaching/math19b_2011/handouts/lecture33.pdf), and $M$ is a [stochastic/Markov matrix](https://en.wikipedia.org/wiki/Stochastic_matrix).
#
# **Iteration**, at $O(n^2)$ per step, is better than **Gaussian elimination**, at $O(n^3)$, for a graph of $n$ nodes.
#
# In practice, for the Web itself, 50-75 iterations are sufficient to converge to within the error limits of double-precision arithmetic.

# In[7]:

# eg 5.2
v_ = dist_after_surfing(matrix_5_1, v, 10)
v_


# In[8]:

v_ = dist_after_surfing(matrix_5_1, v, 50)
v_


# In[9]:

v_ = dist_after_surfing(matrix_5_1, v, 75)
v_


# #### 5.1.3 Structure of the Web

# Some structures in the real Web violate the assumptions needed for the Markov-process iteration to converge to a meaningful limit.

# In[10]:

plt.figure(figsize=(10, 10))
plt.imshow(plt.imread('./res/fig_5_2.png'))


# **Two problems** we need to avoid by modifying PageRank:
#
# 1. dead ends:
# pages with no links out.
#
# 2. spider traps:
# groups of pages that all have out-links, but never link to any page outside the group.

# #### 5.1.4 Avoiding Dead Ends

# Dead ends $\to$ $M$ is no longer stochastic, since some of the columns will sum to 0 rather than 1.
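# As a quick check (a minimal sketch, not from the book): a dead end shows up as an all-zero column of $M$, so the column sums tell us whether $M$ is still stochastic. `find_dead_ends` below is a hypothetical helper name.

# In[ ]:

def find_dead_ends(M):
    """Illustrative helper (not from the book): return the indices of columns
    that sum to 0, i.e. pages with no out-links."""
    col_sums = np.asarray(M).sum(axis=0)
    return np.flatnonzero(np.isclose(col_sums, 0))

# matrix_5_1 is column-stochastic, so no dead ends are reported
print(find_dead_ends(matrix_5_1))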
# If we compute $M^iv$ with a substochastic $M$, then some or all of the components of the vector go to 0.

# In[11]:

# eg 5.3
plt.imshow(plt.imread('./res/fig_5_3.png'))


# In[12]:

M = np.array([
        [0, 1/3, 1/3, 1/3],
        [1/2, 0, 0, 1/2],
        [0, 0, 0, 0],
        [0, 1/2, 1/2, 0]
    ]).T
M


# In[13]:

dist_after_surfing(M, v, 50)


# ##### Two Solutions

# ###### 1. Drop the dead ends
# Recursively delete dead ends (together with their incoming arcs), and solve the remaining graph $G'$.
# Then restore $G$ from $G'$ recursively, in the reverse order of deletion, computing the PageRank of each deleted node $e$ as
# $$ v_e = \sum_p \frac{v_p}{k_p} $$
# where $e \in (G - G')$, $p$ ranges over the predecessors of $e$ (whose PageRanks have already been computed), and $k_p$ is the number of successors of $p$ in $G$.

# In[14]:

# eg 5.4
plt.imshow(plt.imread('./res/fig_5_4.png'))


# In[15]:

M_G = np.array([
        [0, 1/3, 1/3, 1/3, 0],
        [1/2, 0, 0, 1/2, 0],
        [0, 0, 0, 0, 1],
        [0, 1/2, 1/2, 0, 0],
        [0, 0, 0, 0, 0]
    ]).T
M_G


# In[16]:

from sklearn.preprocessing import normalize

index = [0, 1, 3]
M = M_G.take(index, axis=0).take(index, axis=1)
M = normalize(M, norm='l1', axis=0)
M


# In[17]:

n = M.shape[1]
v = np.ones((n, 1)) / n
v_ = dist_after_surfing(M, v, 50)
v_


# In[18]:

# the PageRank of C, whose predecessors are A (3 successors) and D (2 successors)
1/3 * v_[0] + 1/2 * v_[2]


# In[19]:

# the PageRank of E, whose only predecessor is C; since E is C's only successor,
# it is the same as the PageRank of C
_


# ###### 2. Modify the surfing process
# "taxation": see Section 5.1.5.

# #### 5.1.5 Spider Traps and Taxation

# **spider traps**:
# sets of nodes with no dead ends, but with no arcs out to the rest of the graph.
# They cause the PageRank calculation to place all the weight within the spider traps.

# In[20]:

plt.imshow(plt.imread('./res/fig_5_6.png'))


# In[21]:

M = np.array([
        [0, 1/3, 1/3, 1/3],
        [1/2, 0, 0, 1/2],
        [0, 0, 1, 0],
        [0, 1/2, 1/2, 0]
    ]).T
M


# In[22]:

np.round(dist_after_surfing(M, steps=50), 3)


# **Solution**:
# allow each random surfer a small probability of teleporting to a random page, rather than following an out-link.
# $$v' = \beta M v + (1 - \beta) \frac{e}{n}$$
# where $n$ is the number of nodes in $G$, and $e$ is a vector of all 1's.

# In[23]:

def dist_using_taxation(M, v=None, beta=1, steps=1):
    """Iterate v' = beta * M v + (1 - beta) * e / n."""
    n = M.shape[1]
    if v is None:
        v = np.ones((n, 1)) / n
    e = np.ones(v.shape)
    for __ in range(steps):
        v = beta * M.dot(v) + (1 - beta) * e / n
    return v

dist_using_taxation(M, beta=0.8, steps=30)


# Although C gets more than half of the PageRank for itself, the effect has been limited.
#
# **Note** that a random surfer has **three ways** to move:
#
# 1. follow a link.
#
# 2. teleport to a random page. $\gets$ taxation
#
# 3. go nowhere. $\gets$ dead ends
#
# Since there will always be some fraction of surfers operating on the Web, even if there are dead ends the sum of the components of $v$ may be less than 1, but it will never reach 0.

# #### 5.1.6 Using PageRank in a Search Engine

# 1. Find the qualified pages, which have at least one of the search terms in the query.
#
# 2. Calculate a score for those pages, with PageRank as an important component of the score.

# #### 5.1.7 Exercises
#
# ###### 5.1.1
# Omitted.
#
# ###### 5.1.2
# Omitted.
#
# ###### 5.1.3
# each of the $n$ original nodes: $1/n$
# the additional node: $n \times \frac{1}{n} \times \frac{1}{n} = \frac{1}{n}$
#
# ###### 5.1.4
# todo
#
# ###### 5.1.5
# Omitted.
#
# ###### 5.1.6
# the first node: 1
# the remaining nodes: 1/2
#
# ###### 5.1.7
# root: 1
# height = 1: $\frac{1}{3}$
# height = 2: $\frac{1}{3} \times \frac{1}{2}$
# height = k: $\frac{1}{3} \times (\frac{1}{2})^{k-1}, \quad k > 1$

# ### 5.2 Efficient Computation of PageRank

# PageRank is an iterated matrix-vector multiplication $\to$ a natural fit for MapReduce.
# However, we must deal with two issues:
#
# 1. $M$ is very sparse.
# $\to$ represent it by listing only its nonzero elements (a small illustration follows this list).
#
# 2. We may wish to use a combiner to reduce the amount of data passed from Map to Reduce.
# $\to$ the striping approach.
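# As an illustration of point 1 (a minimal sketch, not from the text): `scipy.sparse` keeps only the (row, column, value) triples of the nonzero entries, the same idea as the compact representation discussed next. This assumes scipy is available (it is a dependency of scikit-learn, which is already used above).

# In[ ]:

from scipy.sparse import csc_matrix

# store only the nonzero entries of the transition matrix, column by column
M_sparse = csc_matrix(matrix_5_1)
print(M_sparse)
print('stored entries: {} of {}'.format(M_sparse.nnz, matrix_5_1.size))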
# #### 5.2.1 Representing Transition Matrices

# The proper way to represent any sparse matrix is to list the **locations** and **values** of the nonzero entries.
# The space needed is linear in the number of nonzero entries.
#
# Since each nonzero entry of a column of $M$ equals 1 divided by the column's out-degree, we can represent a column by:
# 1. one integer for the out-degree, and
# 2. one integer (the row number) per nonzero entry.

# In[24]:

matrix_5_1


# In[25]:

import string

df_M = pd.DataFrame(matrix_5_1,
                    index=list(string.ascii_uppercase[0:4]),
                    columns=list(string.ascii_uppercase[0:4]))
df_M


# In[26]:

def compact_representation_of_sparse_matrix(df):
    """Compact representation introduced in Example 5.7:
    for each column (page), its out-degree and the rows of its nonzero entries."""
    degree = df.apply(np.count_nonzero, axis=0)
    dest = pd.Series({col: np.flatnonzero(df[col].values) for col in df.columns})
    return pd.concat([degree, dest], axis=1, keys=['Degree', 'Destinations'])

compact_representation_of_sparse_matrix(df_M)


# #### 5.2.2 PageRank Iteration Using MapReduce

# Each iteration computes: $$v' = \beta M v + (1 - \beta) \frac{e}{n}$$
#
# If $v$ is much too large to fit in main memory, we can use the method of striping:
# break $M$ into vertical stripes and break $v$ into corresponding horizontal stripes.

# #### 5.2.3 Use of Combiners to Consolidate the Result Vector

# There are two reasons the method of Section 5.2.2 might not be adequate:
#
# 1. We might wish to add the terms for $v'_i$ at the Map tasks (i.e., use a combiner).
#
# 2. We might execute the iteration step at a single machine or a collection of machines, without MapReduce.
#
# Hence, we need to implement a combiner in conjunction with a Map task.
#
# **thrashing**:
# components of $v'$ are repeatedly moved between main memory and disk.
# An alternative strategy is based on partitioning $M$ into $k^2$ blocks, while the vectors are still partitioned into $k$ stripes.

# In[27]:

plt.imshow(plt.imread('./res/fig_5_12.png'))


# Each task gets one block $M_{ij}$ and the stripe $v_j$.
# Thus, $v$ is transmitted over the network $k$ times, but $M$ is sent only once.
#
# The advantage of this approach:
# we can keep both $v_j$ and $v'_i$ in main memory as we process $M_{ij}$.

# #### 5.2.4 Representing Blocks of the Transition Matrix

# For each column of a block $M_{ij}$, we need to list its out-degree and the rows of its nonzero entries within the block.
# As a consequence, the out-degree of a column is repeated for every block that has a nonzero entry in that column, possibly as many times as the out-degree itself.

# In[28]:

plt.figure(figsize=(10, 10))
plt.imshow(plt.imread('./res/fig_5_14.png'))
# todo


# #### 5.2.5 Other Efficient Approaches to PageRank Iteration

# We can assign all the blocks of one row of blocks to a single Map task,
# namely, use blocks $M_{i1}$ through $M_{ik}$ and all of $v$ to compute $v'_i$.

# #### 5.2.6 Exercises
#
# ###### ex 5.2.1
# The fraction of 1's should be less than $\frac{\log_2 n}{n}$.
#
# ###### ex 5.2.2
# Omitted.
#
# ###### ex 5.2.3
# Omitted.
#
# ###### ex 5.2.4
# $k = n$

# ### 5.3 Topic-Sensitive PageRank

# ###### improvement
# We can weight certain pages more heavily because of their topic.
#
# ###### motivation
# 1. Different people have different interests, and sometimes distinct interests are expressed using the same term in a query.
#
# 2. It is not feasible to build a private PageRank vector for each user.
#
# **solution**:
# create one vector for each of some small number of topics, biasing the PageRank to favor pages of that topic,
# and classify users according to the degree of their interest in each of the selected topics.
#
# pros: we only need to store a short vector per user.
# cons: we lose some accuracy.
#
# _note_: One useful topic set is the 16 top-level categories of the Open Directory ([DMOZ](http://www.dmoz.org)).

# #### 5.3.2 Biased Random Walks

# **Intuition**:
# Pages linked to by sports pages are themselves likely to be about sports.
#
# The only difference from ordinary PageRank is how we add the new surfers:
# $$v' = \beta M v + (1 - \beta) \frac{e_S}{|S|}$$
# where $S$ is the teleport set (the pages identified as belonging to a certain topic), and $e_S$ is a vector that has 1 in the components corresponding to $S$ and 0 in the other components.

# In[29]:

plt.imshow(plt.imread('./res/fig_5_15.png'))


# In[30]:

beta = 0.8
M = matrix_5_1
S = ['B', 'D']

e_s = pd.Series(np.zeros(4), index=list(string.ascii_uppercase[0:4]))
for s in S:
    e_s[s] = 1
e_s


# In[31]:

M


# In[32]:

print('v = \n{} v \n+ \n{}'.format(beta * M, (1 - beta) * e_s / np.sum(e_s)))
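# The cell above only prints the update rule. Below is a minimal sketch that actually iterates it for the teleport set $S = \{B, D\}$ with $\beta = 0.8$; the helper name `topic_sensitive_pagerank` is mine, not the book's.

# In[ ]:

def topic_sensitive_pagerank(M, e_s, beta=0.8, steps=50):
    """Iterate v' = beta * M v + (1 - beta) * e_s / |S|, starting from the uniform distribution."""
    n = M.shape[1]
    v = np.ones(n) / n
    teleport = e_s.values / e_s.sum()
    for _ in range(steps):
        v = beta * M.dot(v) + (1 - beta) * teleport
    return pd.Series(v, index=e_s.index)

np.round(topic_sensitive_pagerank(M, e_s, beta=beta), 3)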
# #### 5.3.3 Using Topic-Sensitive PageRank

# ###### Integrating topic-sensitive PageRank into a search engine
# 1. Decide on the topics.
#
# 2. Pick a teleport set for each topic, and compute the topic-sensitive PageRank vector for that set.
#
# 3. Find a way of determining the topic(s) of a particular search query.
#
# 4. Use the corresponding topic-sensitive PageRank vector(s) to order the responses.
#
# ###### How to select topics
# + Use the top-level topics of the Open Directory.
#
# + Allow the user to select a topic from a menu.
#
# + Infer the topic(s) from the words in the user's recent search queries.
#
# + Infer the topic(s) from information about the user.

# #### 5.3.4 Inferring Topics from Words

# **Idea**:
# Topics are characterized by words that appear surprisingly often in documents on that topic.
#
# Be **careful**:
# avoid extremely rare words (e.g., misspellings) $\gets$ put a floor on the number of times a word appears.
#
# It is important to pick a reasonably large set $S$ to make sure that we cover all aspects of the topic represented by the set.
#
# We can also blend the topic-sensitive PageRank vectors to reflect a user's hybrid interests.

# ### 5.4 Link Spam

# **link spam**: techniques for artificially increasing the PageRank of a page.
#
# **spam farm**: a collection of pages whose purpose is to increase the PageRank of a certain page or pages.
#
# #### 5.4.1 Architecture of a Spam Farm

# In[33]:

plt.figure(figsize=(10, 10))
plt.imshow(plt.imread('./res/fig_5_16.png'))


# Links from accessible pages: e.g., spam links placed in the comments of blogs or news sites.

# #### 5.4.2 Analysis of a Spam Farm

# Given: $n$ pages in total, $m$ supporting pages, and $y$, the PageRank of the target page $t$.
#
# Then the PageRank of each supporting page is:
# $$\frac{\beta y}{m} + \frac{1 - \beta}{n}$$
#
# Suppose $x$ is the contribution of all accessible pages to the target page.
# Then the PageRank $y$ of the target page is:
#
# \begin{align}
# y &= x + \beta \left(\frac{\beta y}{m} + \frac{1 - \beta}{n}\right) m + \frac{1 - \beta}{n} \\
#   &\approx x + \beta \left(\frac{\beta y}{m} + \frac{1 - \beta}{n}\right) m \\
#   &= x + \beta^2 y + \beta (1-\beta) \frac{m}{n}
# \end{align}
#
# Solving for $y$:
# $$y = \frac{x}{1-\beta^2} + c \frac{m}{n}$$
# where $c = \frac{\beta}{1+\beta}$.

# In[34]:

beta = 0.85
x_coe = 1 / (1 - beta**2)
c = beta / (1 + beta)
print('y = {} x + {} m/n'.format(x_coe, c))


# With $\beta = 0.85$, the external contribution $x$ is amplified by a factor of about 3.6, and the farm captures about 46% of the fraction $m/n$.

# #### 5.4.3 Combating Link Spam

# Two solutions:
#
# 1. Identify the structures typical of spam farms.
# cons: spammers can modify their structures and develop new kinds of spam farms; the war never ends.
#
# 2. Use information about trustworthy pages to help judge:
# + TrustRank
# + spam mass
#
# ##### TrustRank
# Topic-sensitive PageRank where the "topic" is a set of pages believed to be trustworthy (not spam).
#
# **theory**:
# While a spam page might easily be made to link to a trustworthy page, it is unlikely that a trustworthy page would link to a spam page.
#
# **Developing the teleport set of trustworthy pages** (blogs and news sites should be excluded, since spammers can place links there):
#
# + Let humans examine pages and decide;
# e.g., pick the pages of highest PageRank to examine.
#
# + Pick a domain whose membership is controlled;
# e.g., .edu, .gov, .edu.cn.
#
# ##### Spam Mass
# **idea**: measure, for each page, the fraction of its PageRank that comes from spam.
#
# Suppose page $p$ has PageRank $r$ and TrustRank $t$. Then the spam mass of $p$ is
# $$sm(p) = \frac{r-t}{r}$$
#
# If $sm(p)$ is close to 1, then $p$ is probably spam.
# If $sm(p)$ is negative or a small positive number, then $p$ is probably not spam.
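# A minimal sketch of the spam-mass formula. The PageRank and TrustRank values below are made up purely to illustrate the computation; they are not from the book, and the page names are hypothetical.

# In[ ]:

def spam_mass(pagerank, trustrank):
    """sm(p) = (r - t) / r for each page p."""
    return (pagerank - trustrank) / pagerank

# hypothetical example values, not real measurements
r = pd.Series([0.30, 0.25, 0.05], index=['P1', 'P2', 'P3'])  # PageRank
t = pd.Series([0.28, 0.02, 0.04], index=['P1', 'P2', 'P3'])  # TrustRank
sm = spam_mass(r, t)
print(sm)
print('likely spam:', list(sm[sm > 0.9].index))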
# #### 5.4.6 Exercises for Section 5.4

# ###### 5.4.1
# (a)
# target page $y$: $y = x + \frac{1-\beta}{n}$
# supporting page $z$:
#
# \begin{align}
# z &= \beta \left(\frac{y}{m} + z\right) + \frac{1-\beta}{n} \\
#   &= \frac{\beta}{(1-\beta) m} y + \frac{1}{n}
# \end{align}
#
# (b)
# target page $y$: $y = x + \frac{1-\beta}{n}$
# supporting page $z$: $$z = \beta \frac{y}{m} + \frac{1-\beta}{n}$$
#
# (c)
# target page $y$: $$y = x + \beta \frac{z}{2} m + \frac{1-\beta}{n}$$
# supporting page $z$: $$z = \beta \left(\frac{y}{m} + \frac{z}{2}\right) + \frac{1-\beta}{n}$$
#
# Then solve for $y$ and $z$. Omitted.
#
# ###### 5.4.2
# Omitted.
#
# ###### 5.4.3
# todo

# ### 5.5 Hubs and Authorities (HITS, hyperlink-induced topic search)

# HITS is used by the Ask search engine.
#
# #### 5.5.1 The Intuition behind HITS

# Two flavors of importance:
#
# 1. authorities: pages that provide information about a topic.
#
# 2. hubs: pages that tell you where to go to find out about that topic.
#
# PageRank: a page is important if important pages link to it.
# HITS: a page is a good hub if it links to good authorities, and a page is a good authority if it is linked to by good hubs.
#
# #### 5.5.2 Formalizing Hubbiness and Authority

# $h$ - hubbiness vector, $a$ - authority vector.
#
# To keep the values from growing beyond bounds, we normally scale $h$ and $a$: force either the largest component to be 1 or the sum of the components to be 1.
#
# **link matrix**: $L_{ij} = 1$ if page $i$ links to page $j$, and $L_{ij} = 0$ otherwise.
#
# We have: $$h = \lambda L a$$
# and: $$a = \mu L^{T} h$$
#
# By substitution:
# \begin{align}
# h &= \lambda \mu L L^{T} h \\
# a &= \lambda \mu L^{T} L a
# \end{align}
#
# Since $L L^{T}$ and $L^{T} L$ are not as sparse as $L$ and $L^{T}$, for Web-sized graphs the only feasible way of computing the solution is **iteratively**:
# 1. Start with $h$ a vector of all 1's.
# 2. Compute $a = L^{T} h$ and then scale so the largest component is 1.
# 3. Next, compute $h = L a$ and scale again.
# 4. Repeat steps 2-3 until $h$ and $a$ change little between iterations.

# #### 5.5.3 Exercises for Section 5.5

# Omitted.
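# A minimal sketch of the iteration described in Section 5.5.2. The link matrix below is a small made-up example, not one of the book's figures, and the helper name `hits` is mine.

# In[ ]:

def hits(L, steps=50):
    """Alternately compute a = L^T h and h = L a, scaling each so its largest component is 1."""
    h = np.ones(L.shape[0])
    for _ in range(steps):
        a = L.T.dot(h)
        a = a / a.max()
        h = L.dot(a)
        h = h / h.max()
    return h, a

# made-up link matrix: L[i, j] = 1 if page i links to page j
L = np.array([
        [0, 1, 1, 0],
        [1, 0, 0, 1],
        [0, 0, 0, 1],
        [0, 1, 0, 0],
    ])
h, a = hits(L)
print('hubbiness:', np.round(h, 3))
print('authority:', np.round(a, 3))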