from collections import defaultdict
from Bio.PopGen.GenePop import read
from Bio.PopGen.GenePop.LargeFileParser import read as read_large
from genomics.popgen.plink.convert import to_genepop
f = open('relationships_w_pops_121708.txt')
pop_ind = defaultdict(list)
f.readline() # header
for line in f:
toks = line.rstrip().split('\t')
fam_id = toks[0]
ind_id = toks[1]
pop = toks[-1]
pop_ind[pop].append((fam_id, ind_id))
all_inds = []
for inds in pop_ind.values():
all_inds.extend(inds)
for line in open('hapmap1.ped'):
toks = line.rstrip().replace(' ', '\t').split('\t')
fam = toks[0]
ind = toks[1]
if (fam, ind) not in all_inds:
print('Problems with %s/%s' % (fam, ind))
Problems with 2469/NA20281
to_genepop('hapmap1_auto', 'hapmap1_auto', pop_ind)
to_genepop('hapmap10', 'hapmap10', pop_ind)
to_genepop('hapmap10_auto', 'hapmap10_auto', pop_ind)
to_genepop('hapmap10_auto_noofs_ld', 'hapmap10_auto_noofs_ld', pop_ind)
to_genepop('hapmap10_auto_noofs_2', 'hapmap10_auto_noofs_2', pop_ind)
#slow
#talk about coding: ACGT - 1234
#talk about pop Pop names
Careful: this will severely increase your memory usage, consider it optional
rec = read(open('hapmap1_auto.gp'))
print('Header: %s' % len(rec.comment_line))
print('Number of loci %d' % len(rec.loci_list))
print('Number of populations %d' % len(rec.pop_list))
print('Population names: %s' % ', '.join(rec.pop_list))
print('Individuals per population %s' % ', '.join([str(len(inds)) for inds in rec.populations]))
ind = rec.populations[1][0]
print('Individual %s, SNP %s, alleles: %d %d' % (ind[0], rec.loci_list[0], ind[1][0][0], ind[1][0][1]))
del rec
#talk about population names
def count_individuals(fname):
rec = read_large(open(fname))
pop_sizes = []
for line in rec.data_generator():
if line == ():
pop_sizes.append(0)
else:
pop_sizes[-1] += 1
return pop_sizes
print('Individuals per population %s' % ', '.join([str(ninds) for ninds in count_individuals('hapmap1_auto.gp')]))
Individuals per population 82, 165, 84, 85, 88, 86, 90, 77, 171, 88, 167
print(len(read_large(open('hapmap10.gp')).loci_list))
print(len(read_large(open('hapmap10_auto.gp')).loci_list))
print(len(read_large(open('hapmap10_auto_noofs_ld.gp')).loci_list))
143809 138694 55217