!ls -l ../datasets/segmentationOriginal/

import numpy as np
import pandas as pd

cell_segmentation = pd.read_csv("../datasets/segmentationOriginal/segmentationOriginal.csv")

cell_segmentation.shape

cell_segmentation.head(5)

cell_segmentation.groupby('Case').count()

# separate training and test data
cell_train = cell_segmentation.loc[cell_segmentation['Case'] == 'Train']
cell_test = cell_segmentation.loc[cell_segmentation['Case'] == 'Test']

cell_train.head(5)

%matplotlib inline
import matplotlib.pyplot as plt

# some nice default configuration for plots
plt.rcParams['figure.figsize'] = 10, 7.5
plt.rcParams['axes.grid'] = True
plt.gray()

# compare the raw, log-transformed and square-root-transformed distributions of VarIntenCh3
fig, (ax1, ax2, ax3) = plt.subplots(1, 3)

ax1.hist(cell_train['VarIntenCh3'].values, bins=20)
ax1.set_xlabel('Natural Units')
ax1.set_ylabel('Count')

ax2.hist(np.log(cell_train['VarIntenCh3'].values), bins=20)
ax2.set_xlabel('Log Units')

ax3.hist(np.sqrt(cell_train['VarIntenCh3'].values), bins=20)
ax3.set_xlabel('Square Root Units')

from scipy.stats import skew

r = np.max(cell_train['VarIntenCh3'].values) / np.min(cell_train['VarIntenCh3'].values)
skewness = skew(cell_train['VarIntenCh3'].values)
print('Ratio of the largest to smallest value is {0} \nSample skewness statistic is {1}'.format(r, skewness))

from scipy.stats import boxcox

print('Estimated lambda is {0}'.format(boxcox(cell_train['VarIntenCh3'].values)[1]))

# Box-Cox transformation of PerimCh1
fig, (ax1, ax2) = plt.subplots(1, 2)

ax1.hist(cell_train['PerimCh1'].values, bins=20)
ax1.set_xlabel('Natural Units')
ax1.set_ylabel('Count')

ax2.hist(boxcox(cell_train['PerimCh1'].values)[0], bins=20)
ax2.set_xlabel('Transformed Data (lambda = {:1.4f})'.format(boxcox(cell_train['PerimCh1'].values)[1]))

# toy example to illustrate the effect of outliers
beta0 = -2.3  # intercept
beta1 = 0.8   # slope
n = 1000
x1_true = np.random.normal(4, 2, n)

# generate a random sample from the linear model
x2_true = beta0 + beta1 * x1_true + np.random.normal(size=n)

# generate outliers that do not follow the model
x1_outliers = np.random.uniform(-4, -3, 8)
x2_outliers = x1_outliers + np.random.normal(size=8)

plt.scatter(x1_true, x2_true)
plt.plot(x1_outliers, x2_outliers, 'ro', markersize=8)

from sklearn.preprocessing import scale

x1 = scale(np.concatenate([x1_true, x1_outliers]))
x2 = scale(np.concatenate([x2_true, x2_outliers]))
x = np.column_stack([x1, x2])

# spatial sign: project each centered and scaled sample onto the unit circle
dist = x[:, 0]**2 + x[:, 1]**2
x1 = x[:, 0] / np.sqrt(dist)
x2 = x[:, 1] / np.sqrt(dist)

plt.scatter(x1[:-8], x2[:-8])
plt.plot(x1[-8:], x2[-8:], 'ro', markersize=8)
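# The spatial sign above is simply a row-wise L2 normalization of the centered and scaled
# data; a minimal equivalent sketch with scikit-learn's Normalizer, assuming the matrix x
# from the cell above:
from sklearn.preprocessing import Normalizer

x_spatial_sign = Normalizer(norm='l2').fit_transform(x)  # divide each row by its L2 norm
np.allclose(x_spatial_sign[:, 0], x1), np.allclose(x_spatial_sign[:, 1], x2)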
cell_train_subset = cell_train[['Class', 'FiberWidthCh1', 'EntropyIntenCh1']]

# plot the two predictors on the original scale, colored by class
colors = ['b', 'r']
markers = ['s', 'o']
c = ['PS', 'WS']
for k, m in enumerate(colors):
    i = (cell_train_subset['Class'] == c[k])
    plt.scatter(cell_train_subset.loc[i, 'FiberWidthCh1'],
                cell_train_subset.loc[i, 'EntropyIntenCh1'],
                c=m, marker=markers[k], alpha=0.4, s=26, label=c[k])

plt.title('Original Data')
plt.xlabel('Channel 1 Fiber Width')
plt.ylabel('Entropy Intensity of Channel 1')
plt.legend(loc='upper right')
plt.show()

from sklearn.decomposition import PCA

pca = PCA()
pca.fit(cell_train_subset[['FiberWidthCh1', 'EntropyIntenCh1']])
print('Variance explained by PCs {0}'.format(pca.explained_variance_ratio_))

cell_train_subset_pca = pca.transform(cell_train_subset[['FiberWidthCh1', 'EntropyIntenCh1']])

# plot the same data after rotation onto the principal components
for k, m in enumerate(colors):
    i = np.where(cell_train_subset['Class'] == c[k])[0]
    plt.scatter(cell_train_subset_pca[i, 0], cell_train_subset_pca[i, 1],
                c=m, marker=markers[k], alpha=0.4, s=26, label=c[k])

plt.title('Transformed')
plt.xlabel('Principal Component #1')
plt.ylabel('Principal Component #2')
plt.legend(loc='upper right')
plt.show()

cell_train.head(5)

cell_train_feature = cell_train.iloc[:, 4:]
cell_train_feature.head(5)

# Box-Cox transformation on positive predictors
# separate positive and non-positive predictors
pos_indx = np.where(cell_train_feature.apply(lambda x: np.all(x > 0)))[0]
cell_train_feature_pos = cell_train_feature.iloc[:, pos_indx]
print("# of positive features is {0}".format(pos_indx.shape[0]))
cell_train_feature_nonpos = cell_train_feature.drop(cell_train_feature.columns[pos_indx], axis=1, inplace=False)
print("# of non-positive features is {0}".format(cell_train_feature.shape[1] - pos_indx.shape[0]))

cell_train_feature_pos_tr = cell_train_feature_pos.apply(lambda x: boxcox(x)[0])
cell_train_feature_tr = np.c_[cell_train_feature_pos_tr, cell_train_feature_nonpos]
print("The shape before/after transformation is {0} and {1}".format(cell_train_feature.shape, cell_train_feature_tr.shape))

# scale and center predictors
from sklearn.preprocessing import scale
cell_train_feature_tr = scale(cell_train_feature_tr, with_mean=True, with_std=True)

# conduct PCA on the transformed predictors
from sklearn.decomposition import PCA
pca = PCA()
pca.fit(cell_train_feature_tr)

# generate scree plot
plt.plot(pca.explained_variance_ratio_ * 100)
plt.xlabel('Component')
plt.ylabel('Percent of Total Variance')

print("The first four components account for {0} of the total variance".format(pca.explained_variance_ratio_[:4]))
print("All together they account for {0} of the total variance".format(np.sum(pca.explained_variance_ratio_[:4])))
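# Rather than reading the number of components off the scree plot by eye, scikit-learn's PCA
# also accepts a fractional n_components; a minimal sketch (the 0.95 threshold is an
# illustrative choice, not part of the original analysis):
pca_95 = PCA(n_components=0.95)   # keep enough PCs to explain 95% of the variance
pca_95.fit(cell_train_feature_tr)
print("{0} components explain 95% of the total variance".format(pca_95.n_components_))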
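# The per-column Box-Cox loop earlier in this cell can also be written with scikit-learn's
# PowerTransformer; a minimal sketch, assuming scikit-learn >= 0.20 (method='box-cox'
# requires strictly positive inputs, which is why only the positive predictors are passed):
from sklearn.preprocessing import PowerTransformer

bc = PowerTransformer(method='box-cox', standardize=False)
cell_train_feature_pos_bc = bc.fit_transform(cell_train_feature_pos)
bc.lambdas_[:5]   # estimated lambda for the first five positive predictors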
# look at the first 3 PCs
pca = PCA(n_components=3)
cell_train_feature_pca = pca.fit_transform(cell_train_feature_tr)

colors = ['b', 'r']
markers = ['s', 'o']
c = ['PS', 'WS']

fig, axarr = plt.subplots(3, 3, sharex=True, sharey=True)

# PC1 vs PC3
handles = []
for k, m in enumerate(colors):
    i = np.where(cell_train['Class'] == c[k])[0]
    handles.append(axarr[0, 0].scatter(cell_train_feature_pca[i, 0], cell_train_feature_pca[i, 2],
                                       c=m, marker=markers[k], alpha=0.4, s=26, label=c[k]))

# PC2 vs PC3
for k, m in enumerate(colors):
    i = np.where(cell_train['Class'] == c[k])[0]
    axarr[0, 1].scatter(cell_train_feature_pca[i, 1], cell_train_feature_pca[i, 2],
                        c=m, marker=markers[k], alpha=0.4, s=26, label=c[k])

# PC1 vs PC2
for k, m in enumerate(colors):
    i = np.where(cell_train['Class'] == c[k])[0]
    axarr[1, 0].scatter(cell_train_feature_pca[i, 0], cell_train_feature_pca[i, 1],
                        c=m, marker=markers[k], alpha=0.4, s=26, label=c[k])

axarr[2, 0].text(0.5, -1.0, 'PC1', ha='center', va='center', fontsize=24)
axarr[1, 1].text(0.5, -1.0, 'PC2', ha='center', va='center', fontsize=24)
axarr[0, 2].text(0.5, -1.0, 'PC3', ha='center', va='center', fontsize=24)

fig.legend(handles, ('PS', 'WS'), loc='upper center', ncol=2, frameon=False)
fig.subplots_adjust(hspace=0.12, wspace=0.1)
fig.text(0.5, 0.06, 'Scatter Plot Matrix', ha='center', va='center', fontsize=18)

# loadings
pca.components_.shape

# randomly sample 50 test set points
import random
cell_test_subset = cell_test.iloc[np.sort(random.sample(range(cell_test.shape[0]), 50))]

# separate the predictor to impute (VarIntenCh3) from the remaining features
cell_test_subset_f = cell_test_subset.iloc[:, 4:].drop('VarIntenCh3', axis=1)
cell_test_subset_v = cell_test_subset.iloc[:, 4:]['VarIntenCh3']
cell_train_f = cell_train_feature.drop('VarIntenCh3', axis=1)
cell_train_v = cell_train_feature['VarIntenCh3']

# scale and center before imputation
from sklearn.preprocessing import StandardScaler

# standardize based on the training set
sc_f = StandardScaler()
cell_train_f_sc = sc_f.fit_transform(cell_train_f)
cell_test_subset_f_sc = sc_f.transform(cell_test_subset_f)
sc_v = StandardScaler()
cell_train_v_sc = sc_v.fit_transform(cell_train_v.values.reshape(-1, 1)).ravel()
cell_test_subset_v_sc = sc_v.transform(cell_test_subset_v.values.reshape(-1, 1)).ravel()

# use 5-nearest neighbors
from sklearn.neighbors import NearestNeighbors

nbrs = NearestNeighbors(n_neighbors=5)
nbrs.fit(cell_train_f_sc)  # based on the training set
distance, indices = nbrs.kneighbors(cell_test_subset_f_sc)  # neighbors for the test set

# imputation: for each test sample, average VarIntenCh3 over its nearest training
# neighbors (the single closest match is skipped)
cell_test_subset_v_pred_knn = np.empty(50)
for idx, i in enumerate(indices):
    cell_test_subset_v_pred_knn[idx] = np.mean(cell_train_v_sc[i[1:]])

from scipy.stats import pearsonr

print("corr('VarIntenCh3', 'DiffIntenDensityCh3') is {0}".format(
    pearsonr(cell_train_v, cell_train_f['DiffIntenDensityCh3'])[0]))

# use a linear model based on the predictor with the highest correlation
from sklearn.linear_model import LinearRegression

lm = LinearRegression()
lm.fit(cell_train_f_sc[:, cell_train_f.columns.get_loc('DiffIntenDensityCh3')][:, np.newaxis],
       cell_train_v_sc[:, np.newaxis])
cell_test_subset_v_pred_lm = \
    lm.predict(cell_test_subset_f_sc[:, cell_train_f.columns.get_loc('DiffIntenDensityCh3')][:, np.newaxis])

print("kNN: {0}".format(pearsonr(cell_test_subset_v_sc, cell_test_subset_v_pred_knn)[0]))
print("Linear Model: {0}".format(pearsonr(cell_test_subset_v_sc, cell_test_subset_v_pred_lm.ravel())[0]))

# compare imputed values with the true (centered and scaled) values
fig, (ax1, ax2) = plt.subplots(1, 2)

ax1.scatter(cell_test_subset_v_sc, cell_test_subset_v_pred_knn)
ax1.set(xlim=(-1.5, 3), ylim=(-1.5, 3))
ax1.plot(ax1.get_xlim(), ax1.get_ylim(), ls="--", c=".3")
ax1.set_title('5NN')

ax2.scatter(cell_test_subset_v_sc, cell_test_subset_v_pred_lm)
ax2.set(xlim=(-1.5, 3), ylim=(-1.5, 3))
ax2.plot(ax2.get_xlim(), ax2.get_ylim(), ls="--", c=".3")
ax2.set_title('Linear Model')

fig.text(0.5, 0.04, 'Original Value (centered and scaled)', ha='center', va='center')
fig.text(0.06, 0.5, 'Imputed', ha='center', va='center', rotation='vertical')
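# A minimal sketch of the same 5-NN imputation with scikit-learn's built-in imputer,
# assuming scikit-learn >= 0.22 (where sklearn.impute.KNNImputer is available); unlike the
# loop above it averages all five neighbors, found with a NaN-aware Euclidean distance:
from sklearn.impute import KNNImputer

train_full = np.column_stack([cell_train_f_sc, cell_train_v_sc])           # VarIntenCh3 as last column
test_missing = np.column_stack([cell_test_subset_f_sc, np.full(50, np.nan)])  # mask it out in the test set

imputer = KNNImputer(n_neighbors=5)
imputer.fit(train_full)
cell_test_subset_v_pred_sk = imputer.transform(test_missing)[:, -1]  # imputed VarIntenCh3
print("KNNImputer: {0}".format(pearsonr(cell_test_subset_v_sc, cell_test_subset_v_pred_sk)[0]))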
# calculate the correlation matrix
corr_dataframe = cell_train_feature.corr()

# compute a hierarchical clustering on the rows and columns of the correlation matrix and plot a heat map
def corr_heatmap(corr_dataframe):
    import scipy.cluster.hierarchy as sch
    corr_matrix = np.array(corr_dataframe)
    col_names = corr_dataframe.columns
    Y = sch.linkage(corr_matrix, 'single', 'correlation')
    Z = sch.dendrogram(Y, color_threshold=0, no_plot=True)['leaves']
    # reorder rows and columns according to the clustering
    corr_matrix = corr_matrix[Z, :]
    corr_matrix = corr_matrix[:, Z]
    col_names = col_names[Z]
    im = plt.imshow(corr_matrix, interpolation='nearest', aspect='auto', cmap='bwr')
    plt.colorbar()
    plt.xticks(range(corr_matrix.shape[0]), col_names, rotation='vertical', fontsize=4)
    plt.yticks(range(corr_matrix.shape[0]), col_names, fontsize=4)

# plot
corr_heatmap(corr_dataframe)

!ls -l ../datasets/GermanCredit/

credit_data = pd.read_csv("../datasets/GermanCredit/GermanCredit.csv")

credit_data.head(5)

credit_data.shape

# dummy variables for the savings account balance: one categorical predictor spread over five columns
credit_data_saving = credit_data[['SavingsAccountBonds.lt.100',
                                  'SavingsAccountBonds.100.to.500',
                                  'SavingsAccountBonds.500.to.1000',
                                  'SavingsAccountBonds.gt.1000',
                                  'SavingsAccountBonds.Unknown']]
credit_data_saving.head(10)

# count how many samples fall into each category
credit_data_saving.apply(np.sum)
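# Sparse dummy columns like these can become degenerate, near-zero-variance predictors; a
# minimal sketch of a check in the spirit of caret's nearZeroVar (the frequency-ratio and
# percent-unique cutoffs below are caret's defaults, assumed here for illustration):
def near_zero_var(df, freq_cut=95.0 / 5.0, unique_cut=10.0):
    flagged = []
    for col in df.columns:
        counts = df[col].value_counts()
        freq_ratio = counts.iloc[0] / counts.iloc[1] if len(counts) > 1 else np.inf
        pct_unique = 100.0 * df[col].nunique() / len(df[col])
        if freq_ratio > freq_cut and pct_unique < unique_cut:
            flagged.append(col)
    return flagged

near_zero_var(credit_data_saving)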
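# The between-predictor correlations visualized in the heat map earlier can also drive a
# simple filter in the spirit of caret's findCorrelation; a minimal sketch that greedily
# drops one predictor from every pair whose absolute correlation exceeds a cutoff
# (0.75 is an assumed threshold for illustration):
def find_high_correlation(corr_dataframe, cutoff=0.75):
    corr_abs = corr_dataframe.abs()
    cols = corr_abs.columns
    to_drop = set()
    for i in range(len(cols)):
        for j in range(i + 1, len(cols)):
            if cols[i] in to_drop or cols[j] in to_drop:
                continue
            if corr_abs.iloc[i, j] > cutoff:
                # drop the predictor with the larger mean absolute correlation to the rest
                mean_i = corr_abs[cols[i]].drop(cols[i]).mean()
                mean_j = corr_abs[cols[j]].drop(cols[j]).mean()
                to_drop.add(cols[i] if mean_i > mean_j else cols[j])
    return sorted(to_drop)

len(find_high_correlation(corr_dataframe, cutoff=0.75))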