Notebook

In [1]:

import os, sys
import networkx as nx
import numpy as np
import pickle
import pandas as pd
sys.path.append('../');
import Holes as ho
import matplotlib.pyplot as plt

In [2]:

%matplotlib inline

Maximal homology dimension $k$ to calculate¶

In [3]:

max_dimensions=1;

Load generators dictionaries¶

In [4]:

import scipy.io
data_dir='../stores/phom/country_high_act_low_entropy_matrices/'
ld = os.listdir(data_dir);
threshold_factor=0.1;
thr=2.7
call_thresholds=[thr] 
nsigma_round = [-3.4, -3.0, -2.7, -2.3, -1.9, -1.6, -1.2, -0.9, -0.5, -0.1]

countries_to_cells={}
    
weighted_gen_dict={}
for country in ld:
    if country == '.gitignore':
            continue
    country = country.split('.')[0]
    weighted_gen_dict[country]={}
    for call_thr in call_thresholds:
        try:
            filename=open('../stores/phom/output/gen/weighted_generators_'+country+'_.pck');
            weighted_gen_dict[country][call_thr]=pickle.load(filename);
            filename.close()
        except:
            print 'didnt work with', country, call_thr

didnt work with 14413 2.7
didnt work with 14415 2.7
didnt work with 14417 2.7
didnt work with 88213 2.7
didnt work with 88233 2.7
didnt work with 88235 2.7
didnt work with 88239 2.7
didnt work with 246 2.7
didnt work with 247 2.7
didnt work with 18686 2.7
didnt work with 258 2.7
didnt work with 262 2.7
didnt work with 263 2.7
didnt work with 264 2.7
didnt work with 265 2.7
didnt work with 267 2.7
didnt work with 290 2.7
didnt work with 297 2.7
didnt work with 298 2.7
didnt work with 299 2.7
didnt work with 18696 2.7
didnt work with 18765 2.7
didnt work with 18767 2.7
didnt work with 18768 2.7
didnt work with 18769 2.7
didnt work with 379 2.7
didnt work with 12684 2.7
didnt work with 1784 2.7
didnt work with 12843 2.7
didnt work with 12844 2.7
didnt work with 12845 2.7
didnt work with 8816 2.7
didnt work with 8818 2.7
didnt work with 8819 2.7
didnt work with 674 2.7
didnt work with 680 2.7
didnt work with 681 2.7
didnt work with 682 2.7
didnt work with 683 2.7
didnt work with 685 2.7
didnt work with 687 2.7
didnt work with 689 2.7
didnt work with 691 2.7
didnt work with 881 2.7
didnt work with 97222 2.7
didnt work with 68572 2.7
didnt work with 68577 2.7
didnt work with 97256 2.7
didnt work with 29773 2.7
didnt work with 29774 2.7
didnt work with 1129 2.7
didnt work with 1139 2.7
didnt work with 1226 2.7
didnt work with 1235 2.7
didnt work with 1242 2.7
didnt work with 1246 2.7
didnt work with 1249 2.7
didnt work with 1264 2.7
didnt work with 1579 2.7
didnt work with 1284 2.7
didnt work with 17676 2.7
didnt work with 1581 2.7
didnt work with 1340 2.7
didnt work with 1365 2.7
didnt work with 1441 2.7
didnt work with 1450 2.7
didnt work with 1506 2.7
didnt work with 18683 2.7
didnt work with 18684 2.7
didnt work with 7711 2.7
didnt work with 7713 2.7
didnt work with 7714 2.7
didnt work with 7715 2.7
didnt work with 7716 2.7
didnt work with 7717 2.7
didnt work with 7718 2.7
didnt work with 7721 2.7
didnt work with 7722 2.7
didnt work with 7723 2.7
didnt work with 7725 2.7
didnt work with 7726 2.7
didnt work with 7760 2.7
didnt work with 7761 2.7
didnt work with 7762 2.7
didnt work with 7775 2.7
didnt work with 7776 2.7
didnt work with 1641 2.7
didnt work with 1645 2.7
didnt work with 1649 2.7
didnt work with 1664 2.7
didnt work with 1671 2.7
didnt work with 1684 2.7
didnt work with 1709 2.7
didnt work with 1758 2.7
didnt work with 50931 2.7
didnt work with 50936 2.7
didnt work with 50938 2.7
didnt work with 50947 2.7
didnt work with 50948 2.7
didnt work with 1819 2.7
didnt work with 1867 2.7
didnt work with 1902 2.7
didnt work with 1907 2.7
didnt work with 1924 2.7
didnt work with 1927 2.7
didnt work with 1929 2.7
didnt work with 1930 2.7
didnt work with 1938 2.7
didnt work with 1939 2.7

Construct spatial coherency and cyclity from generators¶

In [5]:

wmax_H_1_persistent = {}

for c in weighted_gen_dict:
    wmax_H_1_persistent[c] = {}
    for call_thr in weighted_gen_dict[c]:
        if len(weighted_gen_dict[c][call_thr][1])>0:
            wmax_H_1_persistent[c][call_thr] = np.max(map(lambda x:  x.persistence_interval(), weighted_gen_dict[c][call_thr][1]))
        else:
            wmax_H_1_persistent[c][call_thr]=0;

In [6]:

wH_0_persistent = {}
wH_0_persistent_mod = {}
for c in weighted_gen_dict:
    wH_0_persistent[c] = {}
    wH_0_persistent_mod[c] = {}
    for call_thr in weighted_gen_dict[c]:
        if len(weighted_gen_dict[c][call_thr][0])>1:
            wH_0_persistent[c][call_thr] = (sorted(map(lambda x:  x.persistence_interval(), weighted_gen_dict[c][call_thr][0])))
            wH_0_persistent_mod[c][call_thr] = (wH_0_persistent[c][call_thr][-1]-wH_0_persistent[c][call_thr][-2])/float(wH_0_persistent[c][call_thr][-1])
        else:
            wH_0_persistent[c][call_thr]=[1];
            wH_0_persistent_mod[c][call_thr]=[1];

In [7]:

coords_matrix={}
country_order = {}
for i,call_thr in enumerate(sorted(call_thresholds)):
    coords_matrix[call_thr]=[]
    country_order [call_thr] = []
    for c in wH_0_persistent_mod.keys():
        if call_thr in wH_0_persistent_mod[c]:
            coords_matrix[call_thr].append([ wH_0_persistent_mod[c][call_thr], wmax_H_1_persistent[c][call_thr]/float(max(wH_0_persistent[c][call_thr]))]); 
            country_order[call_thr].append(c);

In [8]:

X={}
for thr in sorted(coords_matrix.keys()):
    X[thr]=np.zeros((int(len(coords_matrix[thr])),2));
    print X[thr].shape
    for i, pair in enumerate(coords_matrix[thr]):
        if plt.is_numlike(pair[0]):
            pair = [pair[0], pair[1]]
            X[thr][i]=pair[0];
            X[thr][i][1]=pair[1];
        else:
            pair = [pair[0][0], pair[1]]
            X[thr][i][0]=pair[0];
            X[thr][i][1]=pair[1];

(247, 2)

K-means¶

In [9]:

import time as time
import numpy as np
import pylab as pl
import mpl_toolkits.mplot3d.axes3d as p3
from sklearn import cluster, datasets

k_means = cluster.KMeans(n_clusters=2, n_init=1)
k_means.fit(X[2.7]) 

values = k_means.cluster_centers_.squeeze()
labels = k_means.labels_
fig = plt.figure()
ax = fig.add_subplot(111)
for l in np.unique(labels):
    ax.plot(X[2.7][labels == l, 0], X[2.7][labels == l, 1],
              'o', color=pl.cm.jet(np.float(l) / np.max(labels + 1)))
plt.title('Without connectivity constraints')

Out[9]:

<matplotlib.text.Text at 0x7f6a7cacb810>

Import population data¶

In [10]:

import pandas as pd
remitt=pd.read_csv('../data/phom/population_data.csv');

In [11]:

short_country_order = {}
for thr in country_order:
    short_country_order[thr]=[]
    for c in country_order[thr]:
        try:
            short_country_order[thr].append(remitt[remitt['prefix']==int(c)]['country_cca'].values[0])
        except:
            pass

In [12]:

X_filter = pd.Series(short_country_order[thr]).str.match('[A-Z]+').values

In [13]:

country_cluster_assignment={}
k_means = cluster.KMeans(n_clusters=2, n_init=1)
k_means.fit(X[thr][X_filter]);
labels = k_means.labels_;
country_cluster_assignment[thr] = zip(short_country_order[thr], labels)

In [14]:

fig = plt.figure(figsize=(21,7))
ax1 = plt.subplot2grid((1,9), (0,0), colspan=4)

thr=2.7

from sklearn import cluster, datasets
k_means = cluster.KMeans(n_clusters=2, n_init=1)
k_means.fit(X[thr][X_filter]) 
values = k_means.cluster_centers_.squeeze()
labels = k_means.labels_

col = ['r','k']

for l in np.unique(labels):
#    ax1.plot(X[thr][labels == l, 0], X[thr][labels == l, 1],
 #             'o', color=pl.cm.jet(np.float(l) / np.max(labels + 1)),markersize=10)
    ax1.plot(X[thr][X_filter][labels == l, 0], X[thr][X_filter][labels == l, 1],
              '.', color=col[l],markersize=13)
for i, c in enumerate(X[thr][X_filter]):
    plt.text(c[0], c[1], short_country_order[thr][i], fontsize=14)

xtickNames = plt.setp(ax1, xticklabels=np.linspace(0,1,6))
plt.setp(xtickNames, fontsize=23)
ytickNames = plt.setp(ax1, yticklabels=np.linspace(-0.05,.2,6))
plt.setp(ytickNames, fontsize=23)
ax1.set_ylabel(r'$r_1$',fontsize=28)
ax1.set_xlabel(r'$r_0$',fontsize=28)
plt.text(0.02,0.186,'(a)',fontsize=23)
plt.ylim(-0.02,0.2)
plt.xlim(0,1.05)

#plt.tight_layout()
    
ax2 = plt.subplot2grid((1,9), (0,4), colspan=2)

group1=[]
group2=[]
for c_tuple in country_cluster_assignment[thr]:
    if c_tuple[1]==0:
        group1.append(c_tuple[0]);
    else:
        group2.append(c_tuple[0]);

GDP_group1=[]
GDP_group2=[]
for c in group1:
    try:
        GDP_group1.append(remitt['GDPpercapita'][remitt['country_cca']==c].values[0])
    except:
        print 'Country not in dataset :', c
for c in group2:
    try:
        GDP_group2.append(remitt['GDPpercapita'][remitt['country_cca']==c].values[0])
    except:
        print 'Country not in dataset :', c

data = [(GDP_group1), (GDP_group2)]
numDists=2
#plt.subplots_adjust(left=0.095, right=0.95, top=0.9, bottom=0.25)
bp = plt.boxplot(data, notch=0, sym='+', vert=1, whis=1.5)
plt.setp(bp['boxes'], color='black')
plt.setp(bp['whiskers'], color='black')
plt.setp(bp['fliers'], color='red', marker='+')

# Add a horizontal grid to the plot, but make it very light in color
# so we can use it for reading data values but not be distracting
ax2.yaxis.grid(True, linestyle='-', which='major', color='lightgrey',
              alpha=0.5)

# Hide these grid behind plot objects
ax2.set_axisbelow(True)
#ax1.set_title('Comparison of IID Bootstrap Resampling Across Five Distributions')
ax2.set_ylabel(r'Avg GDP per capita ($10^3$USD)',fontsize=23)

# Now fill the boxes with desired colors
boxColors = ['red','black']
numBoxes = numDists
medians = range(numBoxes)
for i in range(numBoxes):
  box = bp['boxes'][i]
  boxX = []
  boxY = []
  for j in range(5):
      boxX.append(box.get_xdata()[j])
      boxY.append(box.get_ydata()[j])
  boxCoords = zip(boxX,boxY)
  # Alternate between Dark Khaki and Royal Blue
  k = i % 2
  boxPolygon = plt.Polygon(boxCoords, facecolor=boxColors[k], alpha=0.7)
  ax2.add_patch(boxPolygon)
  # Now draw the median lines back over what we just filled in
  med = bp['medians'][i]
  medianX = []
  medianY = []
  for j in range(2):
      medianX.append(med.get_xdata()[j])
      medianY.append(med.get_ydata()[j])
      plt.plot(medianX, medianY, 'k')
      medians[i] = medianY[0]
  # Finally, overplot the sample averages, with horizontal alignment
  # in the center of each box
  plt.plot([np.average(med.get_xdata())], [np.average(data[i])],
           color='w', marker='*', markeredgecolor='k')

# Set the axes ranges and axes labels
ax2.set_xlim(0.5, numBoxes+0.5)
xtickNames = plt.setp(ax2, xticklabels=['Cluster 1','Cluster 2'])
plt.setp(xtickNames, fontsize=23)
ytickNames = plt.setp(ax2, yticklabels=range(0,100,10))
plt.setp(ytickNames, fontsize=23)
plt.text(0.6,75000,'(b)',fontsize=23)

ax3 = plt.subplot2grid((1,9), (0,6), colspan=2)
GDP_group1=[]
GDP_group2=[]
for c in group1:
    try:
        GDP_group1.append(remitt['AvgMIRemittance'][remitt['country_cca']==c].values[0])
    except:
        print 'Country not in dataset :', c
for c in group2:
    try:
        GDP_group2.append(remitt['AvgMIRemittance'][remitt['country_cca']==c].values[0])
    except:
        print 'Country not in dataset :', c

data = [GDP_group1, (GDP_group2)]
numDists=2
#plt.subplots_adjust(left=0.095, right=0.95, top=0.9, bottom=0.25)
bp = plt.boxplot(data, notch=0, sym='+', vert=1, whis=1.5)
plt.setp(bp['boxes'], color='black')
plt.setp(bp['whiskers'], color='black')
plt.setp(bp['fliers'], color='red', marker='+')

# Add a horizontal grid to the plot, but make it very light in color
# so we can use it for reading data values but not be distracting
ax3.yaxis.grid(True, linestyle='-', which='major', color='lightgrey',
              alpha=0.5)
# Hide these grid behind plot objects
ax3.set_axisbelow(True)
#ax1.set_title('Comparison of IID Bootstrap Resampling Across Five Distributions')
ax3.set_ylabel('Avg remittance fraction per capita (%)',fontsize=23)

# Now fill the boxes with desired colors
boxColors = ['red','black']
numBoxes = numDists
medians = range(numBoxes)
for i in range(numBoxes):
  box = bp['boxes'][i]
  boxX = []
  boxY = []
  for j in range(5):
      boxX.append(box.get_xdata()[j])
      boxY.append(box.get_ydata()[j])
  boxCoords = zip(boxX,boxY)
  # Alternate between Dark Khaki and Royal Blue
  k = i % 2
  boxPolygon = plt.Polygon(boxCoords, facecolor=boxColors[k], alpha=0.7)
  ax3.add_patch(boxPolygon)
  # Now draw the median lines back over what we just filled in
  med = bp['medians'][i]
  medianX = []
  medianY = []
  for j in range(2):
      medianX.append(med.get_xdata()[j])
      medianY.append(med.get_ydata()[j])
      ax3.plot(medianX, medianY, 'k')
      medians[i] = medianY[0]
  # Finally, overplot the sample averages, with horizontal alignment
  # in the center of each box
  plt.plot([np.average(med.get_xdata())], [np.average(data[i])],
           color='w', marker='*', markeredgecolor='k')

# Set the axes ranges and axes labels
ax3.set_xlim(0.5, numBoxes+0.5)
top = 100
bottom = -5
ax3.set_ylim(0,50)
xtickNames = plt.setp(ax3, xticklabels=['Cluster 1','Cluster 2'])
plt.setp(xtickNames, fontsize=23)
ytickNames = plt.setp(ax3, yticklabels=range(0,60,10))
plt.setp(ytickNames, fontsize=23)

plt.text(0.6,47,'(c)',fontsize=23)
plt.tight_layout()
plt.show()