# Load data
from sklearn.datasets import load_digits
data = load_digits()
X, y = data.data, data.target

print X
print X.shape
print X.dtype

# Instantiate a classifier
from sklearn.tree import DecisionTreeClassifier # Change this
clf = DecisionTreeClassifier()                  # ... and that

# Do stuff (and rely on the common interface across estimators)
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.cross_validation import cross_val_score

pipeline = Pipeline([("pca", PCA(n_components=10)), 
                     ("classifier", clf)])

print "Accuracy =", cross_val_score(pipeline, X, y, cv=3).mean()

%%bash
# Load the data from chromosome 15
mkdir -p data
wget -P data http://hapmap.ncbi.nlm.nih.gov/downloads/genotypes/2010-08_phaseII+III/forward/genotypes_chr15_ASW_r28_nr.b36_fwd.txt.gz
wget -P data http://hapmap.ncbi.nlm.nih.gov/downloads/genotypes/2010-08_phaseII+III/forward/genotypes_chr15_CEU_r28_nr.b36_fwd.txt.gz
wget -P data http://hapmap.ncbi.nlm.nih.gov/downloads/genotypes/2010-08_phaseII+III/forward/genotypes_chr15_CHB_r28_nr.b36_fwd.txt.gz
wget -P data http://hapmap.ncbi.nlm.nih.gov/downloads/genotypes/2010-08_phaseII+III/forward/genotypes_chr15_CHD_r28_nr.b36_fwd.txt.gz
wget -P data http://hapmap.ncbi.nlm.nih.gov/downloads/genotypes/2010-08_phaseII+III/forward/genotypes_chr15_GIH_r28_nr.b36_fwd.txt.gz
wget -P data http://hapmap.ncbi.nlm.nih.gov/downloads/genotypes/2010-08_phaseII+III/forward/genotypes_chr15_JPT_r28_nr.b36_fwd.txt.gz
wget -P data http://hapmap.ncbi.nlm.nih.gov/downloads/genotypes/2010-08_phaseII+III/forward/genotypes_chr15_LWK_r28_nr.b36_fwd.txt.gz
wget -P data http://hapmap.ncbi.nlm.nih.gov/downloads/genotypes/2010-08_phaseII+III/forward/genotypes_chr15_MEX_r28_nr.b36_fwd.txt.gz
wget -P data http://hapmap.ncbi.nlm.nih.gov/downloads/genotypes/2010-08_phaseII+III/forward/genotypes_chr15_MKK_r28_nr.b36_fwd.txt.gz
wget -P data http://hapmap.ncbi.nlm.nih.gov/downloads/genotypes/2010-08_phaseII+III/forward/genotypes_chr15_TSI_r28_nr.b36_fwd.txt.gz
wget -P data http://hapmap.ncbi.nlm.nih.gov/downloads/genotypes/2010-08_phaseII+III/forward/genotypes_chr15_YRI_r28_nr.b36_fwd.txt.gz
for f in data/*.gz; do gunzip $f; done

# Look at the 3 first lines of the data 
!head -n 3 data/genotypes_chr15_ASW_r28_nr.b36_fwd.txt

import numpy as np
import pandas as pd

def load_genotypes(filename, population):
    # Load the data into a dataframe
    df = pd.read_csv(filename, sep=" ", index_col=0)

    # Map genotypes to 0-1-2 values
    mask = df.alleles == 'A/C'
    df[mask] = df[mask].replace(to_replace=['AA', 'AC', 'CC'], 
                                value=[0, 1, 2])
    mask = df.alleles == 'A/G'
    df[mask] = df[mask].replace(to_replace=['AA', 'AG', 'GG'], 
                                value=[0, 1, 2])
    mask = df.alleles == 'A/T'
    df[mask] = df[mask].replace(to_replace=['AA', 'AT', 'TT'], 
                                value=[0, 1, 2])
    mask = df.alleles == 'C/G'
    df[mask] = df[mask].replace(to_replace=['CC', 'CG', 'GG'], 
                                value=[0, 1, 2])
    mask = df.alleles == 'C/T'
    df[mask] = df[mask].replace(to_replace=['CC', 'CT', 'TT'], 
                                value=[0, 1, 2])
    mask = df.alleles == 'G/T'
    df[mask] = df[mask].replace(to_replace=['GG', 'GT', 'TT'], 
                                value=[0, 1, 2])
    
    # Replace missing value placeholders
    df.replace(to_replace='NN', value=-1, inplace=True)

    # Copy data into a new frame
    individuals = df.columns[10:]
    snps = df.index

    new_df = pd.DataFrame(index=individuals)
    new_df["population"] = population
    new_df[snps] = df[individuals].T
    new_df = new_df.convert_objects()

    # Drop SNPs corresponding to deletions/insertions
    new_df = new_df.drop(new_df.columns[new_df.dtypes == "object"][1:], axis=1)

    return new_df

# Concatenate all files and only keep common SNPs
filename = "genotypes_chr15_%s_r28_nr.b36_fwd.txt"
populations = ["ASW", "CEU", "CHB", "CHD", "GIH", "JPT", 
               "LWK", "MEX", "MKK", "TSI", "YRI"]

df = pd.concat([load_genotypes(filename % population, population) 
                    for population in populations], 
               join="inner")

# Convert to Numpy arrays
snps = df.columns[1:]
X = df[snps].values.astype(np.int8)
y = df["population"].values.astype(np.str)

# Save for later
np.savez("data/chr15-all-numpy.npz", X=X, y=y, snps=snps)

# Warning! This takes long...

# To save time, let us load the binary files generated above
import numpy as np

data = np.load("data/chr15-all-numpy.npz")
snps = data["snps"]
X = data["X"]
y = data["y"]

print "SNPs =", snps[:10]
print "X =", X
print "X.shape =", X.shape
print "y =", y

# Limit to 3000 SNPs for faster interactions
snps = snps[:3000]
X = X[:, :3000]

print "Before imputation:", np.unique(X)

from sklearn.preprocessing import Imputer
imputer = Imputer(missing_values=-1, strategy="most_frequent")
X = imputer.fit_transform(X)

print "After imputation:", np.unique(X)

# Project the data to a 2D space for visualization
from sklearn.decomposition import RandomizedPCA
Xp = RandomizedPCA(n_components=2, random_state=1).fit_transform(X)

# Setup matplotlib to work interactively
%matplotlib inline
from matplotlib import pyplot as plt

# Plot individuals 
populations = np.unique(y)
colors = plt.get_cmap("hsv")
plt.figure(figsize=(10, 4))

for i, p in enumerate(populations):
    mask = (y == p)
    plt.scatter(Xp[mask, 0], Xp[mask, 1], 
                c=colors(1. * i / 11), label=p)
    
plt.xlim([-30, 50])
plt.legend(loc="best")
plt.show()

from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

from sklearn.linear_model import RidgeClassifierCV
clf = RidgeClassifierCV().fit(X_train, y_train)
clf.score(X_test, y_test)

from sklearn.metrics import confusion_matrix

def print_cm(cm, labels):
    # Nicely print the confusion matrix
    print " " * 4,
    for label in labels:
        print " %s" % label,
    print
    
    for i, label1 in enumerate(labels):
        print label1,
        for j, label2 in enumerate(labels):
            print "%4d" % cm[i, j],
        print
    
print_cm(confusion_matrix(y_test, 
                          clf.predict(X_test), 
                          labels=populations), populations)

# Plot coefficients
coef = np.mean(np.abs(clf.coef_), axis=0)

plt.figure(figsize=(10, 4))
plt.bar(range(len(snps)), coef)
plt.show()

# Top 10 SNPs
indices = np.argsort(coef)[::-1]

for i in range(10):
    print coef[indices[i]], snps[indices[i]]

from sklearn.ensemble import ExtraTreesClassifier
clf = ExtraTreesClassifier(n_estimators=100, 
                           max_features=0.2, 
                           n_jobs=2,
                           random_state=1).fit(X_train, y_train)
clf.score(X_test, y_test)

print_cm(confusion_matrix(y_test, 
                          clf.predict(X_test), 
                          labels=populations), populations)

# Plot importances
importances = clf.feature_importances_

plt.figure(figsize=(10, 4))
plt.bar(range(len(snps)), importances)
plt.show()

# Top 10 SNPs
indices = np.argsort(importances)[::-1]

for i in range(10):
    print importances[indices[i]], snps[indices[i]]