#!/usr/bin/env python # coding: utf-8 # In[1]: from collections import defaultdict from Bio.PopGen.GenePop import read from Bio.PopGen.GenePop.LargeFileParser import read as read_large from genomics.popgen.plink.convert import to_genepop # In[2]: f = open('relationships_w_pops_121708.txt') pop_ind = defaultdict(list) f.readline() # header for line in f: toks = line.rstrip().split('\t') fam_id = toks[0] ind_id = toks[1] pop = toks[-1] pop_ind[pop].append((fam_id, ind_id)) # ## Check for consistency issues # In[3]: all_inds = [] for inds in pop_ind.values(): all_inds.extend(inds) for line in open('hapmap1.ped'): toks = line.rstrip().replace(' ', '\t').split('\t') fam = toks[0] ind = toks[1] if (fam, ind) not in all_inds: print('Problems with %s/%s' % (fam, ind)) # ## Convert from PLINK to Genepop # In[4]: to_genepop('hapmap1_auto', 'hapmap1_auto', pop_ind) to_genepop('hapmap10', 'hapmap10', pop_ind) to_genepop('hapmap10_auto', 'hapmap10_auto', pop_ind) to_genepop('hapmap10_auto_noofs_ld', 'hapmap10_auto_noofs_ld', pop_ind) to_genepop('hapmap10_auto_noofs_2', 'hapmap10_auto_noofs_2', pop_ind) #slow #talk about coding: ACGT - 1234 #talk about pop Pop names # ## Using the in-memory parser # Careful: this will severely increase your memory usage, consider it optional # In[ ]: rec = read(open('hapmap1_auto.gp')) print('Header: %s' % len(rec.comment_line)) print('Number of loci %d' % len(rec.loci_list)) print('Number of populations %d' % len(rec.pop_list)) print('Population names: %s' % ', '.join(rec.pop_list)) print('Individuals per population %s' % ', '.join([str(len(inds)) for inds in rec.populations])) ind = rec.populations[1][0] print('Individual %s, SNP %s, alleles: %d %d' % (ind[0], rec.loci_list[0], ind[1][0][0], ind[1][0][1])) del rec #talk about population names # ## Using the Large File Parser # In[5]: def count_individuals(fname): rec = read_large(open(fname)) pop_sizes = [] for line in rec.data_generator(): if line == (): pop_sizes.append(0) else: pop_sizes[-1] += 1 return pop_sizes # In[6]: print('Individuals per population %s' % ', '.join([str(ninds) for ninds in count_individuals('hapmap1_auto.gp')])) # In[7]: print(len(read_large(open('hapmap10.gp')).loci_list)) print(len(read_large(open('hapmap10_auto.gp')).loci_list)) print(len(read_large(open('hapmap10_auto_noofs_ld.gp')).loci_list)) # In[ ]: