#!/usr/bin/env python
# coding: utf-8

# # UW eScience Python Seminar: Fall 2015
# ## *Introduction to Social Network Analysis in Python*
#
# ### *José Manuel Magallanes Ph.D.*
# **email: magajm@uw.edu**
#
# * Senior Data Science Fellow at eScience Institute and Visiting Professor at Evans School of Public Policy and Governance, University of Washington, Seattle
# * Professor of Political Science and Public Policy, Pontificia Universidad Católica del Perú, Lima
#
# Download materials at https://github.com/CoursesAndTutorials/IntroToSNA

# Plan for this presentation:
# 1. Data formats
# 2. Importing Data and Building a Network
# 3. Exploring Network, agents and groups
# 4. Exporting the Network

# ## 1. Data Formats

# In this presentation I will use these formats:
#
# * **Edges List**
# * **Adjacency matrix**
# * **Adjacency List**
# * **Node List**

# ## 2. Importing Data

# You need to be aware of the format your data has when you import it.
# If your files are already formatted as a network, this step is less
# important. But if you receive a file from which you need to create the
# network, the job can be difficult if you are not aware of some simple
# details shared below.
#
# 2.1 **Importing Edges List:**
#
# An edge list is a common way to gather information on a network; its
# format is shown below:
#
#
#
# The steps to get this data into Python and networkx are shown next:

# In[1]:

# Name and location of the file:
fileEdges='Data/cosponsorshipEdges.csv'

# This reads the CSV file. Not a network yet.
import pandas
EdgesAsDF=pandas.read_csv(fileEdges)
# print(x) with a single argument prints the same under the Python 2 print
# statement and the Python 3 print function.
print(EdgesAsDF.head())  # basic look
print(EdgesAsDF.shape)  # dimensions


# In[2]:

# Here you have a network.
# 'to' is passed as the source column and 'from' as the target column; the
# 'weight' and 'status' columns become edge attributes.
import networkx as net
NWfromEdges=net.from_pandas_dataframe(EdgesAsDF, source='to', target='from',
                                      edge_attr=['weight','status'])

# Here you visit some edges:
NWfromEdges.edges(data=True)[:10]


# In[3]:

# Here you can visualize your import:
import matplotlib.pyplot as plot
get_ipython().run_line_magic('matplotlib', 'inline')
net.draw(NWfromEdges)


# 2.2 **Importing from Adjacency Matrix:**
#
# This is another possible format:
#
#
# Please use this code in this situation:

# In[4]:

# Getting the matrix (edges); the first CSV column becomes the row index:
EdgesAsDF = pandas.read_csv('Data/dataFigueroa.csv', index_col=0)
print(EdgesAsDF.shape)
print(EdgesAsDF.index)  # row names


# In[5]:

EdgesAsDF.columns


# In[6]:

# The adjacency matrix must not include the last column of the data frame.
# .iloc makes the positional intent explicit: the first 37 columns are kept.
adjacency=EdgesAsDF.iloc[:, 0:37]

# saving node labels (the column names of the adjacency matrix)
labels=list(adjacency)

# Network creation (nodes are numbered 0..n-1 at this point)
NWfromMatrix = net.Graph(adjacency.values)

# Adding labels to nodes (relabelling): position i -> i-th column name
NWfromMatrix = net.relabel_nodes(NWfromMatrix, dict(enumerate(labels)))
net.draw_random(NWfromMatrix)


# **2.3 Adjacency List**
#
# Here is an adjacency list:
#
#
# Importing takes a few simple steps:

# In[7]:

# import networkx as net
NWfromAdjList=net.read_adjlist("Data/warsAdjlist.csv",delimiter=',')  # no pandas this time.
net.draw_circular(NWfromAdjList)


# **2.4 Node List**

# The data in the previous file originally came from here:
#
# There are lists of nodes that can make a network. I present all the steps
# taken to get it from Wikipedia:

# In[9]:

## Informing where the data is:
wikiLink='http://en.wikipedia.org/wiki/List_of_border_wars'
identification = { 'User-Agent' : 'Mozilla/5.0'}  # Hiding who you are?
## Getting the html:
from requests import get
wikiPage = get(wikiLink, headers=identification)

## BS will interpret the html
from bs4 import BeautifulSoup as BS
wikiPage = BS(wikiPage.content,'html.parser')

## Let's get all the tables:
wikiTables = wikiPage.findAll('table', {'class': 'sortable wikitable'})


# **NEED FOR CLEANING:**
# From the previous image, you can see there are many fields that will need
# cleaning:
#
#
# I have created a function (**clean_cell**) to do the cleaning:

# In[10]:

import re

def clean_cell(dirtyCellAsString):
    """Extract the runs of digits found in the text of a table cell.

    The text is normalised before the digits are collected:
      * the word "million" is replaced by six zeros;
      * spaces and commas (thousands separators) are removed;
      * one to three digits followed by a dash are dropped, so only the
        right-hand side of short ranges such as "100-200" survives;
      * bracketed markers such as "[1]" or "[12]" are dropped.

    Known limitation: decimal quantities are not handled; "1.5 million"
    yields ['1', '5000000'] because the digits split at the decimal point.

    dirtyCellAsString -- text of the cell, or None (BeautifulSoup's
                         ``Tag.string`` is None when a tag has more than
                         one child).
    Returns a list of digit strings; empty when the input is None or has
    no digits.
    """
    if dirtyCellAsString is None:
        # Nothing to clean; callers treat an empty list as "no value".
        return []
    cleanedCell=dirtyCellAsString.replace("million", "000000")
    cleanedCell=cleanedCell.replace(" ", "").replace(",", "")
    cleanedCell=re.sub(r'[0-9]{1,3}-',"", cleanedCell)
    # Remove the whole bracketed marker, whatever its length, so that the
    # digits of a footnote number are never mistaken for data.
    cleanedCell=re.sub(r'\[[^\]]*\]',"", cleanedCell)
    return re.findall(r'\d+',cleanedCell)


# In[11]:

# Ready to create the network.
# First create an empty network, which I will populate later.
# NOTE(review): net.Graph keeps a single edge per pair of nodes, so a later
# conflict between the same two countries replaces the attributes of an
# earlier one - confirm this is intended.
countriesGraphFullClean=net.Graph()

# This will allow me to make edges when reading a set of nodes:
from itertools import combinations

# Coordinates already retrieved, keyed by country-page link, so that each
# country page is downloaded only once.
coordsByLink={}

# Do this for each table
for eachTable in wikiTables:
    # Get all the rows in the table
    allRows=eachTable.find_all("tr")

    # Do this for each row, BUT the first (the column names).
    for eachRow in allRows[1:]:
        # Get all the cells
        allCells = eachRow.find_all("td")

        # Visit the possibly dirty cells and clean the ones with unclean values
        nameOfConflict=allCells[2].get_text().replace(u'\u2013', '-')
        start=clean_cell(allCells[0].string)
        end=clean_cell(allCells[1].string)
        fatalities=clean_cell(allCells[4].get_text())
        # end - cleaning

        # computing and exceptions: fall back to 1 when a value is missing
        try:
            duration=float(end[0]) - float(start[0])
        except (ValueError,IndexError):
            duration=1
        try:
            fatalities=float(fatalities[0])
        except (ValueError,IndexError):
            fatalities=1

        # Every link in the fourth cell is taken to be a country
        countryLinks=allCells[3].find_all('a')

        # Getting the names of the countries in the current conflict in a list
        countries=[country.get_text() for country in countryLinks]

        # Cleaning one case, and rebuilding the list
        countries=['China' if country in ("PRC", "People's Republic of China") else country
                   for country in countries]

        # Getting the link to each country webpage
        linksEachCountry=['http://en.wikipedia.org'+country.get('href') for country in countryLinks]

        # Preparing to save the coordinates of the countries:
        coords=[]

        # For every link collected
        for link in linksEachCountry:
            # some cleaning:
            if link=='http://en.wikipedia.org/wiki/Soviet_Union':
                link='http://en.wikipedia.org/wiki/Russia'
            if link not in coordsByLink:
                # Same identification header as the request for the main page.
                countryPage=BS(get(link, headers=identification).content,'html.parser')
                # get and unpack the coordinates (first 'geo' span, "lat; long")
                lati,longi=countryPage.find_all("span",{'class': 'geo'})[0].string.split('; ')
                coordsByLink[link]=(float(lati),float(longi))
            # adding the coordinates to the list
            coords.append(coordsByLink[link])

        # pair up each country with its coordinates:
        countryInfo=zip(countries,coords)

        # populating graph
        for country,(lati,longi) in countryInfo:
            countriesGraphFullClean.add_node(country, latitude=lati,longitude=longi)
        for pair in combinations(countries,2):
            countriesGraphFullClean.add_edge(*pair,name=nameOfConflict,duration=duration,fatalities=fatalities)


# In[12]:

# Plotting the network using the coordinates:
X=net.get_node_attributes(countriesGraphFullClean,'longitude')
Y=net.get_node_attributes(countriesGraphFullClean,'latitude')
# Position of each country: (longitude, latitude), for the nodes having both.
posNet={country: (X[country], Y[country]) for country in X if country in Y}
net.draw(countriesGraphFullClean,pos=posNet,with_labels=True)


# ## 3. Exploration

# Using the data on Peruvian elites:
#
#
# I used this data previously, when importing the adjacency matrix. Now it
# is time to explore it.

# ### 3.0 A previous step

# In[13]:

# The adjacency matrix did not include the node attributes.
NWfromMatrix.nodes(data=True)


# In[14]:

# Let's add attributes:
EdgesAsDF = pandas.read_csv('Data/dataFigueroa.csv')

# Creating a dict, using rows of the DF as (name, value) tuples, to make the merge:
nameAndFlag = EdgesAsDF[['Names','Multinacional']].to_records(index=False)
attribute = dict((name, flag) for (name, flag) in nameAndFlag)
attribute  # network nodes not yet updated


# In[15]:

# merging: copy each node's value into a 'Multinational' node attribute
for node in NWfromMatrix:
    # why int?  (presumably to store a plain Python int instead of the
    # numeric type coming from the data frame - TODO confirm)
    NWfromMatrix.node[node]['Multinational']=int(attribute[node])

# Nodes updated!
NWfromMatrix.nodes(data=True)


# ### 3.1 Exploring the NETWORK

# In[16]:

# Is this a connected network?
color_map = {1:'r', 0:'g'}
multinational = net.get_node_attributes(NWfromMatrix,'Multinational')
# one colour per node, in the iteration order of NWfromMatrix
colors=[color_map[multinational[node]] for node in NWfromMatrix]
net.draw(NWfromMatrix,node_color=colors)


# In[17]:

# Improving the visualization with graphviz!
pos=net.graphviz_layout(NWfromMatrix, prog='twopi')  # twopi, dot, neato, fdp, sfdp, circo
plot.figure(figsize=(8,8))
net.draw(NWfromMatrix,pos,node_size=20,node_color=colors,
         with_labels=True, alpha=0.3)


# In[18]:

# The network is not connected. Let's get some details:
# Components inventory (in a component, all nodes are reachable)
components=list(net.connected_component_subgraphs(NWfromMatrix))
len(components),list(map(len, components))


# In[19]:

# Let's work with the giant component (the component with the most nodes):
NWfromMatrix_giant = max(net.connected_component_subgraphs(NWfromMatrix), key=len)


# In[20]:

# How dense or sparse?
net.density(NWfromMatrix_giant)  # from 0 to 1 (where 1 makes it a 'complete' nw)


# In[21]:

# Do nodes tend to connect to nodes similar in degree?
net.degree_assortativity_coefficient(NWfromMatrix_giant)  # positive tends to assortativeness


# In[22]:

# random networks have small ShortestPath and small ClusteringCoefficient...Is this the case?
net.average_shortest_path_length(NWfromMatrix_giant),net.average_clustering(NWfromMatrix_giant)


# The high clustering coefficient would suggest a **small-world** network
# (most nodes are not neighbors of one another, but most nodes can be
# reached from every other in a few steps). See

# In[23]:

# How probable is it that two businessmen with a common business friend are
# also friends?
net.transitivity(NWfromMatrix_giant)


# In[24]:

# Adapted from
# https://networkx.github.io/documentation/latest/examples/drawing/degree_histogram.html
degree_sequence=sorted(net.degree(NWfromMatrix_giant).values(),reverse=True)

plot.figure(figsize=(12,12))
plot.loglog(degree_sequence,'b-',marker='o')
plot.title("Degree rank plot (for Giant Component)")
plot.ylabel("degree")
plot.xlabel("rank")

# draw graph in inset
plot.axes([0.47,0.47,0.47,0.47])
pos=net.spring_layout(NWfromMatrix_giant)
plot.axis('off')
# One colour per node of the giant component. The 'colors' list built
# earlier has one entry per node of the full network, so it does not line
# up with the nodes drawn here.
giantColors=[color_map[NWfromMatrix.node[node]['Multinational']]
             for node in NWfromMatrix_giant]
net.draw_networkx_nodes(NWfromMatrix_giant,pos,node_size=30,node_color=giantColors)
net.draw_networkx_edges(NWfromMatrix_giant,pos,alpha=0.4)
plot.show()


# **Does** this NW have *low density*? *low transitivity*? Is the degree
# distribution *fat-tailed*?: Maybe not a *scale-free* NW... (no
# *preferential attachment* process behind...)

# ### 3.2 Group level

# In[25]:

# How many cliques? (every two distinct vertices in the clique are adjacent)
len(list(net.find_cliques(NWfromMatrix_giant)))


# In[26]:

# Who are the dominant nodes? (nodes to whom the 'others' are adjacent)
# * approximation to min nodes.
import networkx.algorithms.approximation as netalg
dominants=list(netalg.min_weighted_dominating_set(NWfromMatrix_giant))
dominants


# In[27]:

# plotting dominant nodes (red) over the giant component (blue)
pos=net.graphviz_layout(NWfromMatrix_giant, prog='neato')
net.draw(NWfromMatrix_giant,pos,node_color='b',node_size=50, alpha=0.2)
net.draw_networkx_nodes(NWfromMatrix_giant,pos,nodelist=dominants,node_size=50,node_color='r')
plot.show()


# In[28]:

# Matching Edges? Edges such that no two edges share a common endpoint and
# every edge not in the set shares some common endpoint in the set?
maximalMatchingEdges=netalg.min_maximal_matching(NWfromMatrix_giant)
maximalMatchingEdges  # should I apply set(Nodes_maximalMatchingEdges)


# In[29]:

# Plotting maximalMatchingEdges
# NOTE(review): the layout is computed on the full network while only the
# giant component is drawn - confirm this is intended.
pos=net.graphviz_layout(NWfromMatrix,prog='circo',args='')
plot.figure(figsize=(8,8))
net.draw(NWfromMatrix_giant,pos,node_color='b',node_size=50, alpha=0.2)

# nodes: both endpoints of every matching edge
Nodes_maximalMatchingEdges=[node for edge in maximalMatchingEdges for node in edge]
net.draw_networkx_nodes(NWfromMatrix_giant,pos,node_color='r',node_size=50,
                        nodelist=Nodes_maximalMatchingEdges)

# edges: the keyword is 'edge_color'; 'color' is not a parameter of
# draw_networkx_edges, so the matching edges were not drawn in red.
net.draw_networkx_edges(NWfromMatrix_giant,pos,width=5.0,
                        edge_color='r',edgelist=list(maximalMatchingEdges))
plot.show()


# In[30]:

# COMMUNITY DETECTION (set of nodes densely connected internally)
# based on: https://perso.uclouvain.be/vincent.blondel/research/louvain.html
# pip install python-louvain
import community
parts = community.best_partition(NWfromMatrix_giant)
# community id of each node, used as the colour value
values = [parts.get(node) for node in NWfromMatrix_giant.nodes()]

pos=net.graphviz_layout(NWfromMatrix,prog='circo',args='')
plot.figure(figsize=(8,8))
plot.axis("off")
net.draw_networkx_nodes(NWfromMatrix_giant,pos,cmap=plot.get_cmap("cool"),
                        node_color=values,node_size=50)
# edges
net.draw_networkx_edges(NWfromMatrix_giant,pos,width=1.0,alpha=0.2)
plot.show()


# ### 3.3 Nodes

# In[31]:

# Central nodes: degree
from operator import itemgetter
# (node, degree) pairs, highest degree first
NodeDegree=sorted(NWfromMatrix_giant.degree().items(),
                  key=itemgetter(1),reverse=True)
NodeDegree[:5]


# In[32]:

# Ego network of Hub?
HubNode,HubDegree=NodeDegree[0]
HubEgonet=net.ego_graph(NWfromMatrix_giant,HubNode)
pos=net.graphviz_layout(HubEgonet,prog='twopi',args='')
net.draw(HubEgonet,pos,node_color='b',node_size=800,with_labels=True,
         alpha=0.5,node_shape='^')
net.draw_networkx_nodes(HubEgonet,pos,nodelist=[HubNode],node_size=2000,node_color='r')
plot.show()


# In[33]:

# minimum number of nodes that must be removed to disconnect the network?
netalg.node_connectivity(NWfromMatrix_giant)


# In[34]:

# who can break the network?
list(net.articulation_points(NWfromMatrix_giant))


# In[35]:

# Ego network of articulation node?
# NOTE(review): this draws the whole giant component and highlights the
# hard-coded node 'Bentin'; presumably that name was read off the
# articulation points listed above - confirm.
pos=net.graphviz_layout(NWfromMatrix_giant,prog='twopi',args='')
net.draw(NWfromMatrix_giant,pos,node_color='b',node_size=800,with_labels=True,
         alpha=0.5,node_shape='^')
net.draw_networkx_nodes(NWfromMatrix_giant,pos,nodelist=['Bentin'],node_size=2000,node_color='r')
plot.show()


# In[36]:

# Computing centrality measures:
degr=net.degree_centrality(NWfromMatrix_giant)  # based on connections count
clos=net.closeness_centrality(NWfromMatrix_giant)  # "speed" to access the rest
betw=net.betweenness_centrality(NWfromMatrix_giant)  # "control flow" among the network nodes
eige=net.eigenvector_centrality(NWfromMatrix_giant)  # central nodes connected to central nodes (influential?)
# In[37]:

# measures into a data frame: one row per node of the giant component
Centrality=[[rich, degr[rich], clos[rich], betw[rich], eige[rich]]
            for rich in NWfromMatrix_giant]
headers=['Businessman','Degree','Closeness','Betweenness','Eigenvector']
DFCentrality=pandas.DataFrame(Centrality,columns=headers)


# In[38]:

# plotting the centrality values: x = betweenness, y = closeness,
# marker size grows with degree, marker colour follows eigenvector centrality
#values = [parts.get(node) for node in NWfromMatrix_giant.nodes()]
fig, ax = plot.subplots(figsize=(10,10))
ax.scatter(DFCentrality.Betweenness, DFCentrality.Closeness,
           s=(DFCentrality.Degree+1.3)**14,
           c=DFCentrality.Eigenvector,
           cmap=plot.get_cmap('YlOrRd'), alpha=0.6)

# label every point with the name of the businessman
for name, xPos, yPos in zip(DFCentrality['Businessman'],
                            DFCentrality['Betweenness'],
                            DFCentrality['Closeness']):
    ax.annotate(name, (xPos, yPos), alpha=0.5)

plot.title("scatterplot (size for degree of node, color for eigenvalue)")
plot.xlabel("betweenness")
plot.ylabel("closeness")
plot.show()


# ## 4. Exporting the Network

# In[39]:

# NOTE(review): the files are named "giantElite" but the full network
# (NWfromMatrix) is written, not NWfromMatrix_giant - confirm which one is
# wanted.
net.write_graphml(NWfromMatrix, "Data/giantElite.graphml",encoding='utf-8')
net.write_gexf(NWfromMatrix, "Data/giantElite.gexf",encoding='utf-8')

# Each format goes to its own file; writing the GraphML output to the .gexf
# path would let the GEXF export that follows overwrite it.
net.write_graphml(countriesGraphFullClean, "Data/WarsGephi.graphml",encoding='utf-8')
net.write_gexf(countriesGraphFullClean, "Data/WarsGephi.gexf",encoding='utf-8')