!ls -l ../datasets/segmentationOriginal/

import numpy as np
import pandas as pd

cell_segmentation = pd.read_csv("../datasets/segmentationOriginal/segmentationOriginal.csv")

cell_segmentation.shape

cell_segmentation.head(5)

cell_segmentation.groupby('Case').count()

# separate training and test data
cell_train = cell_segmentation.loc[cell_segmentation['Case'] == 'Train']
cell_test = cell_segmentation.loc[cell_segmentation['Case'] == 'Test']

cell_train.head(5)

%matplotlib inline
import matplotlib.pyplot as plt

# some nice default configuration for plots
plt.rcParams['figure.figsize'] = 10, 7.5
plt.rcParams['axes.grid'] = True
plt.gray()

# compare the raw, log-transformed and square-root-transformed distributions of VarIntenCh3
fig, (ax1, ax2, ax3) = plt.subplots(1, 3)

ax1.hist(cell_train['VarIntenCh3'].values, bins=20)
ax1.set_xlabel('Natural Units')
ax1.set_ylabel('Count')

ax2.hist(np.log(cell_train['VarIntenCh3'].values), bins=20)
ax2.set_xlabel('Log Units')

ax3.hist(np.sqrt(cell_train['VarIntenCh3'].values), bins=20)
ax3.set_xlabel('Square Root Units')

from scipy.stats import skew

r = np.max(cell_train['VarIntenCh3'].values) / np.min(cell_train['VarIntenCh3'].values)
skewness = skew(cell_train['VarIntenCh3'].values)
print('Ratio of the largest to smallest value is {0} \nSample skewness statistic is {1}'.format(r, skewness))

from scipy.stats import boxcox

print('Estimated lambda is {0}'.format(boxcox(cell_train['VarIntenCh3'].values)[1]))

# Box-Cox transformation of PerimCh1
fig, (ax1, ax2) = plt.subplots(1, 2)

ax1.hist(cell_train['PerimCh1'].values, bins=20)
ax1.set_xlabel('Natural Units')
ax1.set_ylabel('Count')

ax2.hist(boxcox(cell_train['PerimCh1'].values)[0], bins=20)
ax2.set_xlabel('Transformed Data (lambda = {:1.4f})'.format(boxcox(cell_train['PerimCh1'].values)[1]))

# toy example to illustrate the effect of outliers
beta0 = -2.3  # intercept
beta1 = 0.8   # slope
n = 1000
x1_true = np.random.normal(4, 2, n)

# generate a random sample from the linear model
x2_true = beta0 + beta1 * x1_true + np.random.normal(size=n)

# generate outliers that do not follow the model
x1_outliers = np.random.uniform(-4, -3, 8)
x2_outliers = x1_outliers + np.random.normal(size=8)

plt.scatter(x1_true, x2_true)
plt.plot(x1_outliers, x2_outliers, 'ro', markersize=8)

from sklearn.preprocessing import scale

x1 = scale(np.concatenate([x1_true, x1_outliers]))
x2 = scale(np.concatenate([x2_true, x2_outliers]))
x = np.column_stack([x1, x2])

# spatial sign: project each centered and scaled sample onto the unit circle
dist = x[:, 0]**2 + x[:, 1]**2
x1 = x[:, 0] / np.sqrt(dist)
x2 = x[:, 1] / np.sqrt(dist)

plt.scatter(x1[:-8], x2[:-8])
plt.plot(x1[-8:], x2[-8:], 'ro', markersize=8)
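# The spatial sign above is simply a row-wise L2 normalization of the centered and scaled
# data; a minimal equivalent sketch with scikit-learn's Normalizer, assuming the matrix x
# from the cell above:
from sklearn.preprocessing import Normalizer

x_spatial_sign = Normalizer(norm='l2').fit_transform(x)  # divide each row by its L2 norm
np.allclose(x_spatial_sign[:, 0], x1), np.allclose(x_spatial_sign[:, 1], x2)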
cell_train_subset = cell_train[['Class', 'FiberWidthCh1', 'EntropyIntenCh1']]

# plot the two predictors on the original scale, colored by class
colors = ['b', 'r']
markers = ['s', 'o']
c = ['PS', 'WS']
for k, m in enumerate(colors):
    i = (cell_train_subset['Class'] == c[k])
    plt.scatter(cell_train_subset.loc[i, 'FiberWidthCh1'],
                cell_train_subset.loc[i, 'EntropyIntenCh1'],
                c=m, marker=markers[k], alpha=0.4, s=26, label=c[k])

plt.title('Original Data')
plt.xlabel('Channel 1 Fiber Width')
plt.ylabel('Entropy Intensity of Channel 1')
plt.legend(loc='upper right')
plt.show()

from sklearn.decomposition import PCA

pca = PCA()
pca.fit(cell_train_subset[['FiberWidthCh1', 'EntropyIntenCh1']])
print('Variance explained by PCs {0}'.format(pca.explained_variance_ratio_))

cell_train_subset_pca = pca.transform(cell_train_subset[['FiberWidthCh1', 'EntropyIntenCh1']])

# plot the same data after rotation onto the principal components
for k, m in enumerate(colors):
    i = np.where(cell_train_subset['Class'] == c[k])[0]
    plt.scatter(cell_train_subset_pca[i, 0], cell_train_subset_pca[i, 1],
                c=m, marker=markers[k], alpha=0.4, s=26, label=c[k])

plt.title('Transformed')
plt.xlabel('Principal Component #1')
plt.ylabel('Principal Component #2')
plt.legend(loc='upper right')
plt.show()

cell_train.head(5)

cell_train_feature = cell_train.iloc[:, 4:]
cell_train_feature.head(5)

# Box-Cox transformation on positive predictors
# separate positive and non-positive predictors
pos_indx = np.where(cell_train_feature.apply(lambda x: np.all(x > 0)))[0]
cell_train_feature_pos = cell_train_feature.iloc[:, pos_indx]
print("# of positive features is {0}".format(pos_indx.shape[0]))
cell_train_feature_nonpos = cell_train_feature.drop(cell_train_feature.columns[pos_indx], axis=1, inplace=False)
print("# of non-positive features is {0}".format(cell_train_feature.shape[1] - pos_indx.shape[0]))

cell_train_feature_pos_tr = cell_train_feature_pos.apply(lambda x: boxcox(x)[0])
cell_train_feature_tr = np.c_[cell_train_feature_pos_tr, cell_train_feature_nonpos]
print("The shape before/after transformation is {0} and {1}".format(cell_train_feature.shape, cell_train_feature_tr.shape))

# scale and center predictors
from sklearn.preprocessing import scale
cell_train_feature_tr = scale(cell_train_feature_tr, with_mean=True, with_std=True)

# conduct PCA on the transformed predictors
from sklearn.decomposition import PCA
pca = PCA()
pca.fit(cell_train_feature_tr)

# generate scree plot
plt.plot(pca.explained_variance_ratio_ * 100)
plt.xlabel('Component')
plt.ylabel('Percent of Total Variance')

print("The first four components account for {0} of the total variance".format(pca.explained_variance_ratio_[:4]))
print("All together they account for {0} of the total variance".format(np.sum(pca.explained_variance_ratio_[:4])))
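# Rather than reading the number of components off the scree plot by eye, scikit-learn's PCA
# also accepts a fractional n_components; a minimal sketch (the 0.95 threshold is an
# illustrative choice, not part of the original analysis):
pca_95 = PCA(n_components=0.95)   # keep enough PCs to explain 95% of the variance
pca_95.fit(cell_train_feature_tr)
print("{0} components explain 95% of the total variance".format(pca_95.n_components_))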
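# The per-column Box-Cox loop earlier in this cell can also be written with scikit-learn's
# PowerTransformer; a minimal sketch, assuming scikit-learn >= 0.20 (method='box-cox'
# requires strictly positive inputs, which is why only the positive predictors are passed):
from sklearn.preprocessing import PowerTransformer

bc = PowerTransformer(method='box-cox', standardize=False)
cell_train_feature_pos_bc = bc.fit_transform(cell_train_feature_pos)
bc.lambdas_[:5]   # estimated lambda for the first five positive predictors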
# look at the first 3 PCs
pca = PCA(n_components=3)
cell_train_feature_pca = pca.fit_transform(cell_train_feature_tr)

colors = ['b', 'r']
markers = ['s', 'o']
c = ['PS', 'WS']

fig, axarr = plt.subplots(3, 3, sharex=True, sharey=True)

# PC1 vs PC3
handles = []
for k, m in enumerate(colors):
    i = np.where(cell_train['Class'] == c[k])[0]
    handles.append(axarr[0, 0].scatter(cell_train_feature_pca[i, 0], cell_train_feature_pca[i, 2],
                                       c=m, marker=markers[k], alpha=0.4, s=26, label=c[k]))

# PC2 vs PC3
for k, m in enumerate(colors):
    i = np.where(cell_train['Class'] == c[k])[0]
    axarr[0, 1].scatter(cell_train_feature_pca[i, 1], cell_train_feature_pca[i, 2],
                        c=m, marker=markers[k], alpha=0.4, s=26, label=c[k])

# PC1 vs PC2
for k, m in enumerate(colors):
    i = np.where(cell_train['Class'] == c[k])[0]
    axarr[1, 0].scatter(cell_train_feature_pca[i, 0], cell_train_feature_pca[i, 1],
                        c=m, marker=markers[k], alpha=0.4, s=26, label=c[k])

axarr[2, 0].text(0.5, -1.0, 'PC1', ha='center', va='center', fontsize=24)
axarr[1, 1].text(0.5, -1.0, 'PC2', ha='center', va='center', fontsize=24)
axarr[0, 2].text(0.5, -1.0, 'PC3', ha='center', va='center', fontsize=24)

fig.legend(handles, ('PS', 'WS'), loc='upper center', ncol=2, frameon=False)
fig.subplots_adjust(hspace=0.12, wspace=0.1)
fig.text(0.5, 0.06, 'Scatter Plot Matrix', ha='center', va='center', fontsize=18)

# loadings
pca.components_.shape

# randomly sample 50 test set points
import random
cell_test_subset = cell_test.iloc[np.sort(random.sample(range(cell_test.shape[0]), 50))]

# separate the predictor to impute (VarIntenCh3) from the remaining features
cell_test_subset_f = cell_test_subset.iloc[:, 4:].drop('VarIntenCh3', axis=1)
cell_test_subset_v = cell_test_subset.iloc[:, 4:]['VarIntenCh3']
cell_train_f = cell_train_feature.drop('VarIntenCh3', axis=1)
cell_train_v = cell_train_feature['VarIntenCh3']

# scale and center before imputation
from sklearn.preprocessing import StandardScaler

# standardize based on the training set
sc_f = StandardScaler()
cell_train_f_sc = sc_f.fit_transform(cell_train_f)
cell_test_subset_f_sc = sc_f.transform(cell_test_subset_f)
sc_v = StandardScaler()
cell_train_v_sc = sc_v.fit_transform(cell_train_v.values.reshape(-1, 1)).ravel()
cell_test_subset_v_sc = sc_v.transform(cell_test_subset_v.values.reshape(-1, 1)).ravel()

# use 5-nearest neighbors
from sklearn.neighbors import NearestNeighbors

nbrs = NearestNeighbors(n_neighbors=5)
nbrs.fit(cell_train_f_sc)  # based on the training set
distance, indices = nbrs.kneighbors(cell_test_subset_f_sc)  # neighbors for the test set

# imputation: for each test sample, average VarIntenCh3 over its nearest training
# neighbors (the single closest match is skipped)
cell_test_subset_v_pred_knn = np.empty(50)
for idx, i in enumerate(indices):
    cell_test_subset_v_pred_knn[idx] = np.mean(cell_train_v_sc[i[1:]])

from scipy.stats import pearsonr

print("corr('VarIntenCh3', 'DiffIntenDensityCh3') is {0}".format(
    pearsonr(cell_train_v, cell_train_f['DiffIntenDensityCh3'])[0]))

# use a linear model based on the predictor with the highest correlation
from sklearn.linear_model import LinearRegression

lm = LinearRegression()
lm.fit(cell_train_f_sc[:, cell_train_f.columns.get_loc('DiffIntenDensityCh3')][:, np.newaxis],
       cell_train_v_sc[:, np.newaxis])
cell_test_subset_v_pred_lm = \
    lm.predict(cell_test_subset_f_sc[:, cell_train_f.columns.get_loc('DiffIntenDensityCh3')][:, np.newaxis])

print("kNN: {0}".format(pearsonr(cell_test_subset_v_sc, cell_test_subset_v_pred_knn)[0]))
print("Linear Model: {0}".format(pearsonr(cell_test_subset_v_sc, cell_test_subset_v_pred_lm.ravel())[0]))

# compare imputed values with the true (centered and scaled) values
fig, (ax1, ax2) = plt.subplots(1, 2)

ax1.scatter(cell_test_subset_v_sc, cell_test_subset_v_pred_knn)
ax1.set(xlim=(-1.5, 3), ylim=(-1.5, 3))
ax1.plot(ax1.get_xlim(), ax1.get_ylim(), ls="--", c=".3")
ax1.set_title('5NN')

ax2.scatter(cell_test_subset_v_sc, cell_test_subset_v_pred_lm)
ax2.set(xlim=(-1.5, 3), ylim=(-1.5, 3))
ax2.plot(ax2.get_xlim(), ax2.get_ylim(), ls="--", c=".3")
ax2.set_title('Linear Model')

fig.text(0.5, 0.04, 'Original Value (centered and scaled)', ha='center', va='center')
fig.text(0.06, 0.5, 'Imputed', ha='center', va='center', rotation='vertical')
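# A minimal sketch of the same 5-NN imputation with scikit-learn's built-in imputer,
# assuming scikit-learn >= 0.22 (where sklearn.impute.KNNImputer is available); unlike the
# loop above it averages all five neighbors, found with a NaN-aware Euclidean distance:
from sklearn.impute import KNNImputer

train_full = np.column_stack([cell_train_f_sc, cell_train_v_sc])           # VarIntenCh3 as last column
test_missing = np.column_stack([cell_test_subset_f_sc, np.full(50, np.nan)])  # mask it out in the test set

imputer = KNNImputer(n_neighbors=5)
imputer.fit(train_full)
cell_test_subset_v_pred_sk = imputer.transform(test_missing)[:, -1]  # imputed VarIntenCh3
print("KNNImputer: {0}".format(pearsonr(cell_test_subset_v_sc, cell_test_subset_v_pred_sk)[0]))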
# calculate the correlation matrix
corr_dataframe = cell_train_feature.corr()

# compute a hierarchical clustering on the rows and columns of the correlation matrix and plot a heat map
def corr_heatmap(corr_dataframe):
    import scipy.cluster.hierarchy as sch
    corr_matrix = np.array(corr_dataframe)
    col_names = corr_dataframe.columns
    Y = sch.linkage(corr_matrix, 'single', 'correlation')
    Z = sch.dendrogram(Y, color_threshold=0, no_plot=True)['leaves']
    # reorder rows and columns according to the clustering
    corr_matrix = corr_matrix[Z, :]
    corr_matrix = corr_matrix[:, Z]
    col_names = col_names[Z]
    im = plt.imshow(corr_matrix, interpolation='nearest', aspect='auto', cmap='bwr')
    plt.colorbar()
    plt.xticks(range(corr_matrix.shape[0]), col_names, rotation='vertical', fontsize=4)
    plt.yticks(range(corr_matrix.shape[0]), col_names, fontsize=4)

# plot
corr_heatmap(corr_dataframe)

!ls -l ../datasets/GermanCredit/

credit_data = pd.read_csv("../datasets/GermanCredit/GermanCredit.csv")

credit_data.head(5)

credit_data.shape

# dummy variables for the savings account balance: one categorical predictor spread over five columns
credit_data_saving = credit_data[['SavingsAccountBonds.lt.100',
                                  'SavingsAccountBonds.100.to.500',
                                  'SavingsAccountBonds.500.to.1000',
                                  'SavingsAccountBonds.gt.1000',
                                  'SavingsAccountBonds.Unknown']]
credit_data_saving.head(10)

# count how many samples fall into each category
credit_data_saving.apply(np.sum)
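# Sparse dummy columns like these can become degenerate, near-zero-variance predictors; a
# minimal sketch of a check in the spirit of caret's nearZeroVar (the frequency-ratio and
# percent-unique cutoffs below are caret's defaults, assumed here for illustration):
def near_zero_var(df, freq_cut=95.0 / 5.0, unique_cut=10.0):
    flagged = []
    for col in df.columns:
        counts = df[col].value_counts()
        freq_ratio = counts.iloc[0] / counts.iloc[1] if len(counts) > 1 else np.inf
        pct_unique = 100.0 * df[col].nunique() / len(df[col])
        if freq_ratio > freq_cut and pct_unique < unique_cut:
            flagged.append(col)
    return flagged

near_zero_var(credit_data_saving)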
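# The between-predictor correlations visualized in the heat map earlier can also drive a
# simple filter in the spirit of caret's findCorrelation; a minimal sketch that greedily
# drops one predictor from every pair whose absolute correlation exceeds a cutoff
# (0.75 is an assumed threshold for illustration):
def find_high_correlation(corr_dataframe, cutoff=0.75):
    corr_abs = corr_dataframe.abs()
    cols = corr_abs.columns
    to_drop = set()
    for i in range(len(cols)):
        for j in range(i + 1, len(cols)):
            if cols[i] in to_drop or cols[j] in to_drop:
                continue
            if corr_abs.iloc[i, j] > cutoff:
                # drop the predictor with the larger mean absolute correlation to the rest
                mean_i = corr_abs[cols[i]].drop(cols[i]).mean()
                mean_j = corr_abs[cols[j]].drop(cols[j]).mean()
                to_drop.add(cols[i] if mean_i > mean_j else cols[j])
    return sorted(to_drop)

len(find_high_correlation(corr_dataframe, cutoff=0.75))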