# Load data from sklearn.datasets import load_digits data = load_digits() X, y = data.data, data.target print X print X.shape print X.dtype # Instantiate a classifier from sklearn.tree import DecisionTreeClassifier # Change this clf = DecisionTreeClassifier() # ... and that # Do stuff (and rely on the common interface across estimators) from sklearn.pipeline import Pipeline from sklearn.decomposition import PCA from sklearn.cross_validation import cross_val_score pipeline = Pipeline([("pca", PCA(n_components=10)), ("classifier", clf)]) print "Accuracy =", cross_val_score(pipeline, X, y, cv=3).mean() %%bash # Load the data from chromosome 15 mkdir -p data wget -P data http://hapmap.ncbi.nlm.nih.gov/downloads/genotypes/2010-08_phaseII+III/forward/genotypes_chr15_ASW_r28_nr.b36_fwd.txt.gz wget -P data http://hapmap.ncbi.nlm.nih.gov/downloads/genotypes/2010-08_phaseII+III/forward/genotypes_chr15_CEU_r28_nr.b36_fwd.txt.gz wget -P data http://hapmap.ncbi.nlm.nih.gov/downloads/genotypes/2010-08_phaseII+III/forward/genotypes_chr15_CHB_r28_nr.b36_fwd.txt.gz wget -P data http://hapmap.ncbi.nlm.nih.gov/downloads/genotypes/2010-08_phaseII+III/forward/genotypes_chr15_CHD_r28_nr.b36_fwd.txt.gz wget -P data http://hapmap.ncbi.nlm.nih.gov/downloads/genotypes/2010-08_phaseII+III/forward/genotypes_chr15_GIH_r28_nr.b36_fwd.txt.gz wget -P data http://hapmap.ncbi.nlm.nih.gov/downloads/genotypes/2010-08_phaseII+III/forward/genotypes_chr15_JPT_r28_nr.b36_fwd.txt.gz wget -P data http://hapmap.ncbi.nlm.nih.gov/downloads/genotypes/2010-08_phaseII+III/forward/genotypes_chr15_LWK_r28_nr.b36_fwd.txt.gz wget -P data http://hapmap.ncbi.nlm.nih.gov/downloads/genotypes/2010-08_phaseII+III/forward/genotypes_chr15_MEX_r28_nr.b36_fwd.txt.gz wget -P data http://hapmap.ncbi.nlm.nih.gov/downloads/genotypes/2010-08_phaseII+III/forward/genotypes_chr15_MKK_r28_nr.b36_fwd.txt.gz wget -P data http://hapmap.ncbi.nlm.nih.gov/downloads/genotypes/2010-08_phaseII+III/forward/genotypes_chr15_TSI_r28_nr.b36_fwd.txt.gz wget -P data http://hapmap.ncbi.nlm.nih.gov/downloads/genotypes/2010-08_phaseII+III/forward/genotypes_chr15_YRI_r28_nr.b36_fwd.txt.gz for f in data/*.gz; do gunzip $f; done # Look at the 3 first lines of the data !head -n 3 data/genotypes_chr15_ASW_r28_nr.b36_fwd.txt import numpy as np import pandas as pd def load_genotypes(filename, population): # Load the data into a dataframe df = pd.read_csv(filename, sep=" ", index_col=0) # Map genotypes to 0-1-2 values mask = df.alleles == 'A/C' df[mask] = df[mask].replace(to_replace=['AA', 'AC', 'CC'], value=[0, 1, 2]) mask = df.alleles == 'A/G' df[mask] = df[mask].replace(to_replace=['AA', 'AG', 'GG'], value=[0, 1, 2]) mask = df.alleles == 'A/T' df[mask] = df[mask].replace(to_replace=['AA', 'AT', 'TT'], value=[0, 1, 2]) mask = df.alleles == 'C/G' df[mask] = df[mask].replace(to_replace=['CC', 'CG', 'GG'], value=[0, 1, 2]) mask = df.alleles == 'C/T' df[mask] = df[mask].replace(to_replace=['CC', 'CT', 'TT'], value=[0, 1, 2]) mask = df.alleles == 'G/T' df[mask] = df[mask].replace(to_replace=['GG', 'GT', 'TT'], value=[0, 1, 2]) # Replace missing value placeholders df.replace(to_replace='NN', value=-1, inplace=True) # Copy data into a new frame individuals = df.columns[10:] snps = df.index new_df = pd.DataFrame(index=individuals) new_df["population"] = population new_df[snps] = df[individuals].T new_df = new_df.convert_objects() # Drop SNPs corresponding to deletions/insertions new_df = new_df.drop(new_df.columns[new_df.dtypes == "object"][1:], axis=1) return new_df # Concatenate all files and only keep common SNPs filename = "genotypes_chr15_%s_r28_nr.b36_fwd.txt" populations = ["ASW", "CEU", "CHB", "CHD", "GIH", "JPT", "LWK", "MEX", "MKK", "TSI", "YRI"] df = pd.concat([load_genotypes(filename % population, population) for population in populations], join="inner") # Convert to Numpy arrays snps = df.columns[1:] X = df[snps].values.astype(np.int8) y = df["population"].values.astype(np.str) # Save for later np.savez("data/chr15-all-numpy.npz", X=X, y=y, snps=snps) # Warning! This takes long... # To save time, let us load the binary files generated above import numpy as np data = np.load("data/chr15-all-numpy.npz") snps = data["snps"] X = data["X"] y = data["y"] print "SNPs =", snps[:10] print "X =", X print "X.shape =", X.shape print "y =", y # Limit to 3000 SNPs for faster interactions snps = snps[:3000] X = X[:, :3000] print "Before imputation:", np.unique(X) from sklearn.preprocessing import Imputer imputer = Imputer(missing_values=-1, strategy="most_frequent") X = imputer.fit_transform(X) print "After imputation:", np.unique(X) # Project the data to a 2D space for visualization from sklearn.decomposition import RandomizedPCA Xp = RandomizedPCA(n_components=2, random_state=1).fit_transform(X) # Setup matplotlib to work interactively %matplotlib inline from matplotlib import pyplot as plt # Plot individuals populations = np.unique(y) colors = plt.get_cmap("hsv") plt.figure(figsize=(10, 4)) for i, p in enumerate(populations): mask = (y == p) plt.scatter(Xp[mask, 0], Xp[mask, 1], c=colors(1. * i / 11), label=p) plt.xlim([-30, 50]) plt.legend(loc="best") plt.show() from sklearn.cross_validation import train_test_split X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1) from sklearn.linear_model import RidgeClassifierCV clf = RidgeClassifierCV().fit(X_train, y_train) clf.score(X_test, y_test) from sklearn.metrics import confusion_matrix def print_cm(cm, labels): # Nicely print the confusion matrix print " " * 4, for label in labels: print " %s" % label, print for i, label1 in enumerate(labels): print label1, for j, label2 in enumerate(labels): print "%4d" % cm[i, j], print print_cm(confusion_matrix(y_test, clf.predict(X_test), labels=populations), populations) # Plot coefficients coef = np.mean(np.abs(clf.coef_), axis=0) plt.figure(figsize=(10, 4)) plt.bar(range(len(snps)), coef) plt.show() # Top 10 SNPs indices = np.argsort(coef)[::-1] for i in range(10): print coef[indices[i]], snps[indices[i]] from sklearn.ensemble import ExtraTreesClassifier clf = ExtraTreesClassifier(n_estimators=100, max_features=0.2, n_jobs=2, random_state=1).fit(X_train, y_train) clf.score(X_test, y_test) print_cm(confusion_matrix(y_test, clf.predict(X_test), labels=populations), populations) # Plot importances importances = clf.feature_importances_ plt.figure(figsize=(10, 4)) plt.bar(range(len(snps)), importances) plt.show() # Top 10 SNPs indices = np.argsort(importances)[::-1] for i in range(10): print importances[indices[i]], snps[indices[i]]