# install some software to help evaluate assemblies
!pip install git+https://github.com/ged-lab/screed.git
Downloading/unpacking git+https://github.com/ged-lab/screed.git Cloning https://github.com/ged-lab/screed.git to /tmp/pip-KmDuX1-build Running setup.py egg_info for package from git+https://github.com/ged-lab/screed.git Installing collected packages: screed Running setup.py install for screed Successfully installed screed Cleaning up...
cd /mnt
/mnt
ls
ecoli-reads-5m-dn-orphan.fa.gz g2s.23/ g2s.39/ g.35/ gs.21/ gs.37/ ecoli-reads-5m-dn-paired.fa.gz g2s.25/ g2s.41/ g.37/ gs.23/ gs.39/ g.21/ g2s.27/ g2s.43/ g.39/ gs.25/ gs.41/ g.23/ g2s.29/ g2s.45/ g.41/ gs.27/ gs.43/ g.25/ g2s.31/ g2s.47/ g.43/ gs.29/ gs.45/ g.27/ g2s.33/ g2s.49/ g.45/ gs.31/ gs.47/ g.29/ g2s.35/ g.31/ g.47/ gs.33/ gs.49/ g2s.21/ g2s.37/ g.33/ g.49/ gs.35/ lost+found/
# some Python code to calculate the average contig size, given a lower cutoff.
import os.path, screed
def get_avg_contig_size(dirname, cutoff=200):
contigs_file = os.path.join(dirname, 'contigs.fa')
sizes = []
if not os.path.exists(contigs_file):
return []
for record in screed.open(contigs_file):
length = len(record.sequence)
sizes.append(length)
return sum(sizes) / float(len(sizes))
# more Python code to calculate the averages for all of a particular pattern of assemblies
import glob
def calc_avgs(pattern):
dirnames = sorted(glob.glob(pattern))
klist = []
avgs = []
for dir in dirnames:
k = dir.split('.')[1]
k = int(k)
avg = get_avg_contig_size(dir)
if avg:
klist.append(k)
avgs.append(avg
)
return klist, avgs
g_x, g_y = calc_avgs('g.*')
gs_x, gs_y = calc_avgs('gs.*')
g2s_x, g2s_y = calc_avgs('g2s.*')
g2_x, g2_y = calc_avgs('g2.*')
plot(g_x, g_y, label='pe data w/scaffolding')
plot(gs_x, gs_y, label='pe, no scaffolding')
plot(g2s_x, g2s_y, label='pe + orphans, no scaffolding')
plot(g2_x, g2_y, label='pe + orphans, scaffolding')
xlabel('k value')
ylabel('average contig size for contigs > 200')
axis(xmax=62)
legend()
savefig('/tmp/comparison.png')
g_x
[]