def read_hist(filename):
return numpy.loadtxt(filename)
transcripts_all = read_hist('transcript-reads.fa.hist')
transcripts_dn = read_hist('transcript-reads.fa.keep.hist')
transcripts_trin = read_hist('transcript-reads.fa.normalized_K25_C20_pctSD100.fa.hist')
print 'Unnormalized data:', transcripts_all[:,2][-1], 'total k-mers'
print 'diginorm normalization:', transcripts_dn[:,2][-1], 'total k-mers remaining'
print 'Trinity normalization:', transcripts_trin[:,2][-1], 'total k-mers remaining'
Unnormalized data: 3163778.0 total k-mers diginorm normalization: 1976319.0 total k-mers remaining Trinity normalization: 190815.0 total k-mers remaining
plot(transcripts_all[:,0], transcripts_all[:,1], label='all data')
plot(transcripts_dn[:,0], transcripts_dn[:,1], label='diginorm')
plot(transcripts_trin[:,0], transcripts_trin[:,1], label='Trinity norm')
title('k-mer abundance plot of normalized data')
xlabel('k-mer abundance')
ylabel('N of k-mers with that abundance')
legend()
axis(xmax=40)
savefig('normhist.png')
plot(transcripts_all[:,0], transcripts_all[:,1], label='all data')
plot(transcripts_dn[:,0], transcripts_dn[:,1], label='diginorm')
plot(transcripts_trin[:,0], transcripts_trin[:,1], label='Trinity norm')
title('k-mer abundance plot of normalized data (zoom)')
xlabel('k-mer abundance')
ylabel('N of k-mers with that abundance')
legend()
axis(xmax=40, ymax=20000)
savefig('normhistzoom.png')
plot(transcripts_all[:,0], transcripts_all[:,1] / transcripts_all[:,2][-1], label='all data')
plot(transcripts_dn[:,0], transcripts_dn[:,1] / transcripts_dn[:,2][-1], label='diginorm')
plot(transcripts_trin[:,0], transcripts_trin[:,1] / transcripts_trin[:,2][-1], label='Trinity norm')
title('k-mer abundance plot of normalized data (zoom)')
xlabel('k-mer abundance')
ylabel('Fraction of total k-mers with that abundance')
legend()
axis(xmax=30, ymax=0.3)
savefig('normhistzoom.png')
missing_all = numpy.loadtxt('transcripts.fa.hist')[0][1]
missing_dn = numpy.loadtxt('transcripts.fa.diginorm.hist')[0][1]
missing_trin = numpy.loadtxt('transcripts.fa.trinnorm.hist')[0][1]
print 'Missing', missing_all, 'true k-mers in the sequence reads'
print 'Missing', missing_dn, 'true k-mers in the diginorm reads'
print 'Missing', missing_trin, 'true k-mers in the Trinity norm reads'
print 'Ratio between dn and trin', (missing_trin - missing_all) / (missing_dn - missing_all)
Missing 96.0 true k-mers in the sequence reads Missing 103.0 true k-mers in the diginorm reads Missing 363.0 true k-mers in the Trinity norm reads Ratio between dn and trin 38.1428571429
1976319 / 3163778.0
0.624670567909632
190815.0 / 3163778.0
0.06031238601444223