!rm -f atroparvus.fa.gz gambiae.fa.gz 2>/dev/null
!wget ftp://ftp.vectorbase.org/public_data/organism_data/agambiae/Genome/agambiae.CHROMOSOMES-PEST.AgamP3.fa.gz -O gambiae.fa.gz
!wget https://www.vectorbase.org/download/anopheles-atroparvus-ebroscaffoldsaatre1fagz -O atroparvus.fa.gz
--2015-07-05 17:14:17-- ftp://ftp.vectorbase.org/public_data/organism_data/agambiae/Genome/agambiae.CHROMOSOMES-PEST.AgamP3.fa.gz => ‘gambiae.fa.gz’ Resolving ftp.vectorbase.org (ftp.vectorbase.org)... 129.74.255.228 Connecting to ftp.vectorbase.org (ftp.vectorbase.org)|129.74.255.228|:21... connected. Logging in as anonymous ... Logged in! ==> SYST ... done. ==> PWD ... done. ==> TYPE I ... done. ==> CWD (1) /public_data/organism_data/agambiae/Genome ... done. ==> SIZE agambiae.CHROMOSOMES-PEST.AgamP3.fa.gz ... 81591806 ==> PASV ... done. ==> RETR agambiae.CHROMOSOMES-PEST.AgamP3.fa.gz ... done. Length: 81591806 (78M) (unauthoritative) agambiae.CHROMOSOME 100%[=====================>] 77.81M 583KB/s in 3m 54s 2015-07-05 17:18:12 (341 KB/s) - ‘gambiae.fa.gz’ saved [81591806] --2015-07-05 17:18:12-- https://www.vectorbase.org/download/anopheles-atroparvus-ebroscaffoldsaatre1fagz Resolving www.vectorbase.org (www.vectorbase.org)... 129.74.255.228 Connecting to www.vectorbase.org (www.vectorbase.org)|129.74.255.228|:443... connected. HTTP request sent, awaiting response... 302 Found Location: https://www.vectorbase.org/sites/default/files/ftp/downloads/Anopheles-atroparvus-EBRO_SCAFFOLDS_AatrE1.fa.gz [following] --2015-07-05 17:18:14-- https://www.vectorbase.org/sites/default/files/ftp/downloads/Anopheles-atroparvus-EBRO_SCAFFOLDS_AatrE1.fa.gz Reusing existing connection to www.vectorbase.org:443. HTTP request sent, awaiting response... 200 OK Length: 60941709 (58M) [application/x-gzip] Saving to: ‘atroparvus.fa.gz’ atroparvus.fa.gz 100%[=====================>] 58.12M 282KB/s in 2m 54s 2015-07-05 17:21:09 (341 KB/s) - ‘atroparvus.fa.gz’ saved [60941709/60941709]
from __future__ import division
import gzip
from Bio import SeqIO, SeqUtils
gambiae_name = 'gambiae.fa.gz'
atroparvus_name = 'atroparvus.fa.gz'
recs = SeqIO.parse(gzip.open(gambiae_name), 'fasta')
for rec in recs:
print(rec.description)
#Do not do this with atroparvus
chromosome:AgamP3:2L:1:49364325:1 chromosome 2L chromosome:AgamP3:2R:1:61545105:1 chromosome 2R chromosome:AgamP3:3L:1:41963435:1 chromosome 3L chromosome:AgamP3:3R:1:53200684:1 chromosome 3R chromosome:AgamP3:UNKN:1:42389979:1 chromosome UNKN chromosome:AgamP3:X:1:24393108:1 chromosome X chromosome:AgamP3:Y_unplaced:1:237045:1 chromosome Y_unplaced
recs = SeqIO.parse(gzip.open(gambiae_name), 'fasta')
chrom_Ns = {}
chrom_sizes = {}
for rec in recs:
chrom = rec.description.split(':')[2]
if chrom in ['UNKN', 'Y_unplaced']:
continue
chrom_Ns[chrom] = []
on_N = False
curr_size = 0
for pos, nuc in enumerate(rec.seq):
if nuc in ['N', 'n']:
curr_size += 1
on_N = True
else:
if on_N:
chrom_Ns[chrom].append(curr_size)
curr_size = 0
on_N = False
if on_N:
chrom_Ns[chrom].append(curr_size)
chrom_sizes[chrom] = len(rec.seq)
for chrom, Ns in chrom_Ns.items():
size = chrom_sizes[chrom]
print('%s (%s): %%Ns (%.1f), num Ns: %d, max N: %d' % (chrom, size, 100 * sum(Ns) / size, len(Ns), max(Ns)))
2L (49364325): %Ns (1.7), num Ns: 957, max N: 28884 3R (53200684): %Ns (1.8), num Ns: 1128, max N: 24292 X (24393108): %Ns (4.1), num Ns: 1287, max N: 21132 2R (61545105): %Ns (2.3), num Ns: 1658, max N: 36427 3L (41963435): %Ns (2.9), num Ns: 1272, max N: 31063
import numpy as np
#SeqUtils
recs = SeqIO.parse(gzip.open(atroparvus_name), 'fasta')
sizes = []
size_N = []
for rec in recs:
size = len(rec.seq)
sizes.append(size)
count_N = 0
for nuc in rec.seq:
if nuc in ['n', 'N']:
count_N += 1
size_N.append((size, count_N / size))
print(len(sizes), np.median(sizes), np.mean(sizes), max(sizes), min(sizes),
np.percentile(sizes, 10), np.percentile(sizes, 90))
(1371, 8304.0, 163596.00656455141, 20238125, 1004, 1563.0, 56612.0)
%matplotlib inline
import matplotlib.pyplot as plt
small_split = 4800
large_split = 540000
fig, axs = plt.subplots(1, 3, figsize=(16, 9), squeeze=False)
xs, ys = zip(*[(x, 100 * y) for x, y in size_N if x <= small_split])
axs[0, 0].plot(xs, ys, '.')
axs[0, 0].set_ylim(-0.1, 3.5)
xs, ys = zip(*[(x, 100 * y) for x, y in size_N if x > small_split and x <= large_split])
axs[0, 1].plot(xs, ys, '.')
axs[0, 1].set_xlim(small_split, large_split)
xs, ys = zip(*[(x, 100 * y) for x, y in size_N if x > large_split])
axs[0, 2].plot(xs, ys, '.')
axs[0, 0].set_ylabel('Fraction of Ns', fontsize=12)
axs[0, 1].set_xlabel('Contig size', fontsize=12)
fig.suptitle('Fraction of Ns per contig size', fontsize=26)
<matplotlib.text.Text at 0x7f67d4b1b910>