#!/usr/bin/env python # coding: utf-8 # In[1]: from collections import defaultdict import os import matplotlib.pyplot as plt from genomics.popgen.admix import cluster, plot get_ipython().run_line_magic('matplotlib', 'inline') # In[2]: k_range = range(2, 10) # 2..9 # ### The next cell is very slow. Example outputs are provided (so you can avoid running it) # In[3]: #for k in k_range: # os.system('admixture --cv=10 hapmap10_auto_noofs_ld.bed %d > admix.%d' % (k, k)) # ## Individual order # In[4]: f = open('hapmap10_auto_noofs_ld.fam') ind_order = [] for l in f: toks = l.rstrip().replace(' ', '\t').split('\t') fam_id = toks[0] ind_id = toks[1] ind_order.append((fam_id, ind_id)) f.close() # ## CV-plot # In[5]: CVs = [] for k in k_range: f = open('admix.%d' % k) for l in f: if l.find('CV error') > -1: CVs.append(float(l.rstrip().split(' ')[-1])) break f.close() fig = plt.figure(figsize=(16, 9)) ax = fig.add_subplot(111) ax.plot(k_range, CVs) ax.set_title('Cross-Validation error') ax.set_xlabel('K') # ## Load meta-data # In[6]: f = open('relationships_w_pops_121708.txt') pop_ind = defaultdict(list) f.readline() # header for l in f: toks = l.rstrip().split('\t') fam_id = toks[0] ind_id = toks[1] if (fam_id, ind_id) not in ind_order: continue mom = toks[2] dad = toks[3] if mom != '0' or dad != '0': continue pop = toks[-1] pop_ind[pop].append((fam_id, ind_id)) #ind_pop[('2469', 'NA20281')] = ind_pop[('2805', 'NA20281')] f.close() # In[7]: def load_Q(fname, ind_order): ind_comps = {} f = open(fname) for i, l in enumerate(f): comps = [float(x) for x in l.rstrip().split(' ')] ind_comps[ind_order[i]] = comps f.close() return ind_comps # In[8]: comps = {} for k in k_range: comps[k] = load_Q('hapmap10_auto_noofs_ld.%d.Q' % k, ind_order) # In[9]: ordering = {} for k in k_range: ordering[k] = cluster(comps[k], pop_ind) # In[10]: plot.single(comps[4], ordering[4]) # In[11]: fig = plt.figure(figsize=(16, 9)) plot.stacked(comps, ordering[7], fig) # ## Q files? # ## Log-likelihood # In[ ]: