#!/usr/bin/env python # coding: utf-8 # # Getting the necessary data # You just need to do this only once # In[5]: get_ipython().system('rm -f genotypes.vcf.gz 2>/dev/null') get_ipython().system('tabix -fh ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/release/20130502/supporting/vcf_with_sample_level_annotation/ALL.chr22.phase3_shapeit2_mvncall_integrated_v5_extra_anno.20130502.genotypes.vcf.gz 22:1-17000000|bgzip -c > genotypes.vcf.gz') get_ipython().system('tabix -p vcf genotypes.vcf.gz') # In[2]: from collections import defaultdict get_ipython().run_line_magic('matplotlib', 'inline') import seaborn as sns import matplotlib.pyplot as plt import vcf # In[3]: v = vcf.Reader(filename='genotypes.vcf.gz') print('Variant Level information') infos = v.infos for info in infos: print(info) print('Sample Level information') fmts = v.formats for fmt in fmts: print(fmt) # In[4]: v = vcf.Reader(filename='genotypes.vcf.gz') rec = next(v) print(rec.CHROM, rec.POS, rec.ID, rec.REF, rec.ALT, rec.QUAL, rec.FILTER) print(rec.INFO) print(rec.FORMAT) samples = rec.samples print(len(samples)) sample = samples[0] print(sample.called, sample.gt_alleles, sample.is_het, sample.is_variant, sample.phased) print(int(sample['DP'])) # In[5]: f = vcf.Reader(filename='genotypes.vcf.gz') my_type = defaultdict(int) num_alts = defaultdict(int) for rec in f: my_type[rec.var_type, rec.var_subtype] += 1 if rec.is_snp: num_alts[len(rec.ALT)] += 1 print(my_type) print(num_alts) # In[6]: f = vcf.Reader(filename='genotypes.vcf.gz') sample_dp = defaultdict(int) for rec in f: if not rec.is_snp or len(rec.ALT) != 1: continue for sample in rec.samples: dp = sample['DP'] if dp is None: dp = 0 dp = int(dp) sample_dp[dp] += 1 # In[8]: dps = sample_dp.keys() dps.sort() dp_dist = [sample_dp[x] for x in dps] fig, ax = plt.subplots(figsize=(16, 9)) ax.plot(dp_dist[:50], 'r') ax.axvline(dp_dist.index(max(dp_dist))) # In[ ]: