"""Train and evaluate interaction-prediction classifiers (logistic
regression, SVM, random forest) on iRefIndex feature vectors.

Converted from a flattened IPython notebook transcript to a plain Python 3
script: magics (%matplotlib, %%time, cd, !cmd) are replaced with their
scripting equivalents, and the Python 2 constructs are modernized.
"""
import os
import subprocess
import sys

import matplotlib.pyplot as plt
import numpy as np

# Some nice default configuration for plots
# (%matplotlib inline has no meaning outside a notebook; plt.show() is used.)
plt.rcParams['figure.figsize'] = 10, 7.5
plt.rcParams['axes.grid'] = True
plt.gray()

os.chdir("../../features/")  # was: cd ../../features/

# Feature vectors are read as strings first because they mix numbers with
# the sentinel tokens "NA" and "missing".
posdata = np.loadtxt("human.iRefIndex.positive.vectors.txt", delimiter="\t", dtype="str")
negdata = np.loadtxt("human.iRefIndex.negative.vectors.txt", delimiter="\t", dtype="str")
X = np.concatenate((posdata, negdata))

# Rows containing no "missing" entries.  NOTE(review): computed but never
# used below; kept for parity with the original notebook.
plottablerows = np.where((X == "missing").max(axis=1) == False)

# "NA" -> 0.0, "missing" -> NaN (imputed later), then cast to float.
X[np.where(X == "NA")] = 0.0
X[X == "missing"] = np.nan
X = X.astype(float)  # np.float is deprecated/removed in modern numpy

# Target vector: positives first, then negatives, matching the concatenation.
y = np.array([1] * len(posdata) + [0] * len(negdata))


def propy(y, samplesize=20000):
    """Draw a class-proportional subsample of ``y``.

    The sample keeps a fixed 1:599 positive:negative ratio (the assumed
    class balance of the full candidate-interaction dataset).

    Parameters
    ----------
    y : array-like of int
        Vector of 0/1 class labels.
    samplesize : int
        Total number of labels to draw.

    Returns
    -------
    mody : list of int
        The sampled labels.
    transformdict : dict
        Maps index into ``mody`` -> index into the original ``y`` so the
        sample can be traced back to the full dataset.
    """
    enuy = list(enumerate(y))
    np.random.shuffle(enuy)
    nones = int(samplesize / 600)
    nzeros = int((samplesize * 599) / 600)
    transformdict = {}
    mody = []
    # Iterate over the shuffled labels and pull out the required number of
    # ones and zeros.
    for i, label in enuy:
        if label == 1 and nones > 0:
            # Key by position in mody.  (The original keyed by position in
            # the shuffled enumeration, which left gaps in the keys.)
            transformdict[len(mody)] = i
            mody.append(label)
            nones -= 1
        elif label == 0 and nzeros > 0:
            transformdict[len(mody)] = i
            mody.append(label)
            nzeros -= 1
        if nzeros == 0 and nones == 0:
            break
    return mody, transformdict


mody, tfdict = propy(y, samplesize=400000)
# dicts preserve insertion order (Python 3.7+), so values() lines up with mody.
propindexes = np.array(list(tfdict.values()))
X, y = X[propindexes], y[propindexes]
print(len(X), len(y))  # was a bare notebook display expression

os.chdir("tmp/")  # was: cd tmp/

import sklearn.linear_model

sys.path.append(os.path.abspath("../../opencast-bio/"))
import ocbio.model_selection
import ocbio.mmap_utils
# The notebook's reload() calls are unnecessary in a script.

# Restart the local IPython cluster (was: !ipcluster stop / start).
subprocess.call(["ipcluster", "stop"])
subprocess.call(["ipcluster", "start", "-n=10", "--daemon"])

# NOTE(review): IPython.parallel is the legacy module path; modern installs
# expose the same Client API as ipyparallel.
from IPython.parallel import Client

client = Client(profile='default')
# initialise load balanced distributed processing
lb_view = client.load_balanced_view()

import sklearn.pipeline
import sklearn.preprocessing

imputer = sklearn.preprocessing.Imputer(strategy='mean', missing_values="NaN")
scaler = sklearn.preprocessing.StandardScaler()
classifier = sklearn.linear_model.LogisticRegression()
model = sklearn.pipeline.Pipeline([('imp', imputer), ('scl', scaler), ('cls', classifier)])

trainsizes = np.logspace(3, 5, 5)
# initialise and start the learning-curve run
lcsearch = ocbio.model_selection.LearningCurve(lb_view)
lcsearch.launch_for_arrays(model, X, y, trainsizes, n_cv_iter=5,
                           params={'cls__C': 0.316}, name="lrlc")
print(lcsearch)
lrlc = lcsearch.plot_curve()
np.savez("../../plots/bayes/logistic.regression.learning.curve.npz", lrlc)

# Second cluster profile for the grid searches.
subprocess.call(["ipcluster", "stop", "--profile=altcluster"])
subprocess.call(["ipcluster", "start", "-n=10", "--profile=altcluster", "--daemon"])
altclient = Client(profile='altcluster')
alb_view = altclient.load_balanced_view()
# make sure the processors aren't busy doing anything else:
alb_view.abort()

# initialise parameters to test:
lr_params = {'cls__C': np.logspace(-3, 1, 7)}
# initialise mmaped data - had to increase CV iterations to reduce output variance
split_filenames = ocbio.mmap_utils.persist_cv_splits(X, y, test_size=10000,
                                                     train_size=40000, n_cv_iter=10,
                                                     name='testfeatures', random_state=42)
# initialise and start run
lrsearch = ocbio.model_selection.RandomizedGridSearch(alb_view)
lrsearch.launch_for_splits(model, lr_params, split_filenames)
print(lrsearch.report())

import sklearn.dummy
import sklearn.cross_validation
import sklearn.metrics


def dummycompare(X, y, train, test, best_score):
    """Compare ``best_score`` against a most-frequent-class dummy baseline.

    Returns a 2-tuple ``(scaled, logit)`` where ``scaled`` is ``best_score``
    rebased so the dummy score maps to 0 and a perfect score maps to 1, and
    ``logit`` is its log-odds.  Returns ``(0.0, 0.0)`` when the model does
    no better than the baseline.
    """
    dummy = sklearn.dummy.DummyClassifier(strategy="most_frequent")
    # Use a local imputer: the original refit the shared pipeline imputer,
    # clobbering its state, and fit it on the test fold as well (leakage).
    # Fit on the training fold only; transform the test fold.
    imp = sklearn.preprocessing.Imputer(strategy='mean', missing_values="NaN")
    dummy.fit(imp.fit_transform(X[train]), y[train])
    score = dummy.score(imp.transform(X[test]), y[test])
    if score > best_score:
        # The original returned a bare 0.0 here, which broadcast to
        # (0.0, 0.0) when assigned into a 2-column results row; make the
        # return arity consistent and explicit.
        return 0.0, 0.0
    # logit of the best score, biased and scaled by the dummy score
    scaled = (best_score - score) * (1.0 / (1.0 - score))
    return scaled, np.log(scaled / (1.0 - scaled))


results = np.zeros([5, 2])
for i, (train, test) in enumerate(sklearn.cross_validation.StratifiedShuffleSplit(
        y, n_iter=5, test_size=10000, train_size=40000)):
    results[i, :] = dummycompare(X, y, train, test, lrsearch.find_bests()[0][0])
print(np.mean(results, axis=0))


def plotroc(model, Xtest, ytest):
    """Plot the ROC curve for ``model`` on held-out data; return (fpr, tpr)."""
    estimates = model.predict_proba(Xtest)
    fpr, tpr, thresholds = sklearn.metrics.roc_curve(ytest, estimates[:, 1])
    plt.clf()
    plt.plot(fpr, tpr)
    plt.plot([0, 1], [0, 1], 'k--')  # chance diagonal
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic AUC: {0}'.format(
        sklearn.metrics.roc_auc_score(ytest, estimates[:, 1])))
    plt.show()
    return fpr, tpr


train, test = next(iter(sklearn.cross_validation.StratifiedShuffleSplit(y, test_size=0.25)))
bestparams = lrsearch.find_bests()[0][-1]

# (was a %%time cell)
model.set_params(**bestparams)
model.fit(X[train], y[train])
fpr, tpr = plotroc(model, X[test], y[test])
np.savez("../../plots/bayes/logistic.regression.roc.npz", fpr, tpr)
print(dummycompare(X, y, train, test, model.score(X[test], y[test])))


def drawprecisionrecall(X, y, test, model):
    """Plot a precision-recall curve for ``model`` on ``X[test]``.

    Uses decision_function scores when the estimator provides them, falling
    back to predict_proba otherwise.  Returns (precision, recall).
    """
    try:
        yscore = model.decision_function(X[test])
    except AttributeError:
        yscore = model.predict_proba(X[test])[:, 1]
    precision, recall, _ = sklearn.metrics.precision_recall_curve(y[test], yscore)
    average_precision = sklearn.metrics.average_precision_score(y[test], yscore,
                                                                average="micro")
    plt.clf()
    plt.plot(recall, precision)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall curve, AUC: {0:0.2f}'.format(average_precision))
    plt.show()
    return precision, recall


precision, recall = drawprecisionrecall(X, y, test, model)
np.savez("../../plots/bayes/lr_precisionrecall.npz", precision, recall)

# Inspect the fitted logistic-regression coefficients.
plt.plot(model.named_steps['cls'].coef_.T)
np.savez("../../plots/bayes/logistic.regression.coef.npz", model.named_steps['cls'].coef_.T)
print(np.where(model.named_steps['cls'].coef_.T == np.amax(model.named_steps['cls'].coef_.T)))

# ---------------------------------------------------------------- SVM ----
import sklearn.svm

imputer = sklearn.preprocessing.Imputer(strategy='mean', missing_values="NaN")
scaler = sklearn.preprocessing.StandardScaler()
classifier = sklearn.svm.SVC()
svmodel = sklearn.pipeline.Pipeline([('imp', imputer), ('scl', scaler), ('cls', classifier)])

trainsizes = np.logspace(3, 4.5, 5)
# initialise and start run
lcsearch = ocbio.model_selection.LearningCurve(lb_view)
lcsearch.launch_for_arrays(svmodel, X, y, trainsizes, n_cv_iter=3,
                           params={'cls__C': 1.0}, name="lrlc")
print(lcsearch)
svmlc = lcsearch.plot_curve()
np.savez("../../plots/bayes/svmlc.npz", svmlc)

# initialise parameters to test:
svm_params = {
    'cls__C': np.logspace(-1, 1, 3),
    'cls__kernel': ['rbf', 'poly', 'sigmoid'],
    'cls__gamma': np.logspace(-1, 1, 3),
}
# initialise mmaped data
split_filenames = ocbio.mmap_utils.persist_cv_splits(X, y, test_size=2000,
                                                     train_size=10000, n_cv_iter=3,
                                                     name='testfeatures', random_state=42)
# initialise and start run
search = ocbio.model_selection.RandomizedGridSearch(alb_view)
search.launch_for_splits(svmodel, svm_params, split_filenames)
print(search)

train, test = next(iter(sklearn.cross_validation.StratifiedShuffleSplit(
    y, test_size=2000, train_size=10000)))
print(len(train), len(test))
svmparams = search.find_bests()[0][-1]
svmparams['cls__probability'] = True  # needed so plotroc can call predict_proba

# (was a %%time cell)
svmodel.set_params(**svmparams)
svmodel.fit(X[train], y[train])
fpr, tpr = plotroc(svmodel, X[test], y[test])
os.chdir("tmp")  # was: cd tmp
np.savez("../../plots/bayes/svm.roc.npz", fpr, tpr)
svmprecision, svmrecall = drawprecisionrecall(X, y, test, svmodel)
np.savez("../../plots/bayes/svm.precisionrecall.npz", svmprecision, svmrecall)

# ------------------------------------------------------ random forest ----
import sklearn.ensemble

imputer = sklearn.preprocessing.Imputer(strategy='mean', missing_values="NaN")
scaler = sklearn.preprocessing.StandardScaler()
classifier = sklearn.ensemble.RandomForestClassifier()
rfmodel = sklearn.pipeline.Pipeline([('imp', imputer), ('scl', scaler), ('cls', classifier)])

trainsizes = np.logspace(3, 5, 7)
# initialise and start run
rflcsearch = ocbio.model_selection.LearningCurve(alb_view)
rflcsearch.launch_for_arrays(rfmodel, X, y, trainsizes, n_cv_iter=3,
                             params={'cls__n_estimators': 100, 'cls__bootstrap': False},
                             name="lrlc")
print(rflcsearch)
rflc = rflcsearch.plot_curve()
import subprocess

np.savez("../../plots/bayes/random.forest.learning.curve.npz", rflc)

# Larger memory-mapped CV splits for the random-forest grid search.
split_filenames = ocbio.mmap_utils.persist_cv_splits(X, y, test_size=20000,
                                                     train_size=100000, n_cv_iter=3,
                                                     name='rffeatures', random_state=42)
rfparams = {'cls__n_estimators': 100, 'cls__bootstrap': False}

# Baseline fit with hand-picked parameters (was a %%time cell).
train, test = next(iter(sklearn.cross_validation.StratifiedShuffleSplit(
    y, test_size=20000, train_size=100000)))
rfmodel.set_params(**rfparams)
rfmodel.fit(X[train], y[train])
print("Training score: {0}".format(rfmodel.score(X[train], y[train])))
print("Test score: {0}".format(rfmodel.score(X[test], y[test])))

# Randomized grid over forest size and feature count; bootstrap kept off.
rf_params = {
    'cls__n_estimators': np.logspace(1, 2.3, 5).astype(int),  # np.int is deprecated
    'cls__max_features': np.arange(5, 30, 5).astype(int),
    'cls__bootstrap': [False],
}
rfsearch = ocbio.model_selection.RandomizedGridSearch(lb_view)
rfsearch.launch_for_splits(rfmodel, rf_params, split_filenames)
print(rfsearch)

# Refit with the best parameters on a larger training sample (was %%time).
rfparams = rfsearch.find_bests()[0][-1]
train, test = next(iter(sklearn.cross_validation.StratifiedShuffleSplit(
    y, test_size=20000, train_size=200000)))
rfmodel.set_params(**rfparams)
rfmodel.fit(X[train], y[train])
print("Training score: {0}".format(rfmodel.score(X[train], y[train])))
print("Test score: {0}".format(rfmodel.score(X[test], y[test])))

rfroc = plotroc(rfmodel, X[test], y[test])
np.savez("../../plots/bayes/random.forest.roc.npz", rfroc)
rfprec, rfrecall = drawprecisionrecall(X, y, test, rfmodel)
# was: !git annex unlock ...
subprocess.call(["git", "annex", "unlock",
                 "../../plots/bayes/random.forest.precisionrecall.npz"])
np.savez("../../plots/bayes/random.forest.precisionrecall.npz", rfprec, rfrecall)
# was a bare notebook display expression
print(dummycompare(X, y, train, test, rfsearch.find_bests()[0][0]))

# ------------------------------------------ extremely randomized trees ----
imputer = sklearn.preprocessing.Imputer(strategy='mean', missing_values="NaN")
scaler = sklearn.preprocessing.StandardScaler()
classifier = sklearn.ensemble.ExtraTreesClassifier()
erfmodel = sklearn.pipeline.Pipeline([('imp', imputer), ('scl', scaler), ('cls', classifier)])
extrasearch = ocbio.model_selection.RandomizedGridSearch(alb_view)
extrasearch.launch_for_splits(erfmodel, rf_params, split_filenames)
print(extrasearch)
# Feature importances from the fitted random forest.
importances = rfmodel.named_steps['cls'].feature_importances_
plt.plot(importances)
np.savez("../../plots/bayes/random.forest.importances.npz", importances)

os.chdir("..")  # was: cd ..

import pickle

# Persist the trained random-forest pipeline (context manager replaces the
# manual open/close pair of the original).
with open("random.forest.bayes.classifier.pickle", "wb") as f:
    pickle.dump(rfmodel, f)

# Score every candidate edge in the active zone with the trained model,
# applying the same "NA"/"missing" cleaning as the training features.
edgelist = np.loadtxt("human.activezone.txt", dtype=str)
edgelist[np.where(edgelist == "NA")] = 0.0
edgelist[edgelist == "missing"] = np.nan
edgelist = edgelist.astype(float)  # np.float is deprecated/removed
estimates = rfmodel.predict_proba(edgelist)
print(estimates[5, 1])

import csv

# Append the posterior interaction probability to each edge of the merged
# network, writing a new weighted edge list.
with open("../forGAVIN/mergecode/OUT/edgelist.txt") as f, \
        open("../forGAVIN/mergecode/OUT/edgelist_weighted.txt", "w") as fw:
    c = csv.reader(f, delimiter="\t")
    cw = csv.writer(fw, delimiter="\t")
    next(c)  # skip the header row (was the Python 2-only c.next())
    for i, l in enumerate(c):
        cw.writerow(l + [estimates[i, 1]])

# Sample predicted probabilities for each true class, for later
# distribution plots.
oneindexes = np.where(y == 1)
zeroindexes = np.where(y == 0)
onesamples = rfmodel.predict_proba(X[oneindexes])
zerosamples = rfmodel.predict_proba(X[zeroindexes])
with open("random.forest.bayes.dist.samples.pickle", "wb") as fh:
    pickle.dump({1: onesamples, 0: zerosamples}, fh)