#!/usr/bin/env python # coding: utf-8 #

Social Network Analysis

# # # The goal of this byte is to explore some algorithms and visualizations relating to networks # K-Core Algorithm Introduction: # ============ # * K-Core is an approach of simplifying a graph by removing the edges that have small degrees. The goal of the algorithm is to find groups of nodes that are all connected to at least k other people in the same group # * For more information, you can read this paper http://arxiv.org/pdf/cs/0504107v2.pdf # # Algorithm: # ------------- # 1. Delete all the nodes and corresppoding edges that have degrees less than k # 2. Calculate the degrees of all the remaining nodes. # 3. If the degrees of all the nodes are larger than k or equal to k, return; Otherwise, repeat from step 1 # In[1]: import copy # open the file you have downloaded # these files are organized file = open("amazon.txt") # this returns an array with one entry for each line ni the file lines = file.readlines() print len(lines) # Note: the format of the snap files is to list a node (identified by a unique number) # and all of the nodes it links to (also identified by numbers), on the same line, separated by tabs. # In[2]: # construct the graph # a set is an unordered collection of unique elements edges = set() # this will store our nodes nodes = {} # divide the line into the node and all of its edges # for each line in the file that was loaded in for line in lines: # divide the line into the node and all of its edges data = line.split() a = int(data[0]) b = int(data[1]) # add the edge edges.add((a, b)) # update the count for the number of times we've seen each node nodes[a] = nodes.get(a, -1) + 1 nodes[b] = nodes.get(b, -1) + 1 print "number of unique edges" print len(edges) print "number of unique nodes" print len(nodes) # In[3]: # get the degrees of each node in a set of edges def get_degrees(edges): degree_counts={} # for each pair of nodes (edge) for i,j in edges: # increment the count for the number of edges connected # to each node by one degree_counts[i] = degree_counts.get(i, 0) + 1 degree_counts[j] = degree_counts.get(j, 0) + 1 return degree_counts # Delete all nodes in delete_nodes from edges def delete_node(edges, delete_nodes): # construct a new set of edges new_edges = [] print "# of nodes to be deleted", len(delete_nodes) # loop through all the current edges for i, j in edges: # if an edges two nodes are not in the # set of nodes to be deleted if i not in delete_nodes and j not in delete_nodes: # append that edge to our new edges new_edges.append((i,j)) return new_edges # kcore algorithm # We run the kcore algorithm to delete all # the nodes whose cores are less than k # returns a new set of edges and nodes # including only those in the k core. def kcore(edges, k): # make a complete copy of the edges so we can delete or change # things without messing up our original edges = copy.deepcopy(edges) # now for each pair of nodes, count the number of degree_counts = get_degrees(edges) # sort the nodes by degree and return # only the node numbers (not their degree) sorted_nodes = sorted(degree_counts, key = degree_counts.get) print "largest degree: ", degree_counts[sorted_nodes[0]] # repeatedly delete all nodes with degrees < k to find the k core # if we run out of nodes, or the largest count is < k we should stop while ((len(sorted_nodes) > 0) and (degree_counts[sorted_nodes[0]]