#!/usr/bin/env python # coding: utf-8 # In[1]: get_ipython().system('rm -f atroparvus.fa.gz gambiae.fa.gz 2>/dev/null') get_ipython().system('wget ftp://ftp.vectorbase.org/public_data/organism_data/agambiae/Genome/agambiae.CHROMOSOMES-PEST.AgamP3.fa.gz -O gambiae.fa.gz') get_ipython().system('wget https://www.vectorbase.org/download/anopheles-atroparvus-ebroscaffoldsaatre1fagz -O atroparvus.fa.gz') # In[1]: from __future__ import division import gzip from Bio import SeqIO, SeqUtils # In[2]: gambiae_name = 'gambiae.fa.gz' atroparvus_name = 'atroparvus.fa.gz' # In[3]: recs = SeqIO.parse(gzip.open(gambiae_name), 'fasta') for rec in recs: print(rec.description) #Do not do this with atroparvus # In[4]: recs = SeqIO.parse(gzip.open(gambiae_name), 'fasta') chrom_Ns = {} chrom_sizes = {} for rec in recs: chrom = rec.description.split(':')[2] if chrom in ['UNKN', 'Y_unplaced']: continue chrom_Ns[chrom] = [] on_N = False curr_size = 0 for pos, nuc in enumerate(rec.seq): if nuc in ['N', 'n']: curr_size += 1 on_N = True else: if on_N: chrom_Ns[chrom].append(curr_size) curr_size = 0 on_N = False if on_N: chrom_Ns[chrom].append(curr_size) chrom_sizes[chrom] = len(rec.seq) # In[5]: for chrom, Ns in chrom_Ns.items(): size = chrom_sizes[chrom] print('%s (%s): %%Ns (%.1f), num Ns: %d, max N: %d' % (chrom, size, 100 * sum(Ns) / size, len(Ns), max(Ns))) # ## Atroparvus super-contigs # In[6]: import numpy as np #SeqUtils recs = SeqIO.parse(gzip.open(atroparvus_name), 'fasta') sizes = [] size_N = [] for rec in recs: size = len(rec.seq) sizes.append(size) count_N = 0 for nuc in rec.seq: if nuc in ['n', 'N']: count_N += 1 size_N.append((size, count_N / size)) # In[7]: print(len(sizes), np.median(sizes), np.mean(sizes), max(sizes), min(sizes), np.percentile(sizes, 10), np.percentile(sizes, 90)) # In[8]: get_ipython().run_line_magic('matplotlib', 'inline') import matplotlib.pyplot as plt small_split = 4800 large_split = 540000 fig, axs = plt.subplots(1, 3, figsize=(16, 9), squeeze=False) xs, ys = zip(*[(x, 100 * y) for x, y in size_N if x <= small_split]) axs[0, 0].plot(xs, ys, '.') axs[0, 0].set_ylim(-0.1, 3.5) xs, ys = zip(*[(x, 100 * y) for x, y in size_N if x > small_split and x <= large_split]) axs[0, 1].plot(xs, ys, '.') axs[0, 1].set_xlim(small_split, large_split) xs, ys = zip(*[(x, 100 * y) for x, y in size_N if x > large_split]) axs[0, 2].plot(xs, ys, '.') axs[0, 0].set_ylabel('Fraction of Ns', fontsize=12) axs[0, 1].set_xlabel('Contig size', fontsize=12) fig.suptitle('Fraction of Ns per contig size', fontsize=26) # In[ ]: