from collections import Counter
import numpy as np
from matplotlib import pylab as plt

dataset_name = 'some_name'
dataset_outfolder = 'path/to/some/folder'

# Collect the subread FASTA files via IPython shell capture.
# Older software versions name them *filtered_subreads.fasta:
files1 = !find /path/to/rundata -name "*filtered_subreads.fasta"
# 2.0 and up use *subreads.fasta:
files2 = !find /path/to/rundata -name "*subreads.fasta"

inputs = []
inputs.append(["Dataset name 1", files1])
inputs.append(["Dataset name 2", files2])

def N50(numlist):
    """
    Abstract: Returns the N50 value of the passed list of numbers.
    Usage: N50(numlist)

    Based on the definition from this SEQanswers post
    http://seqanswers.com/forums/showpost.php?p=7496&postcount=4
    (modified Broad Institute's definition
    https://www.broad.harvard.edu/crd/wiki/index.php/N50)

    See SEQanswers threads for details:
    http://seqanswers.com/forums/showthread.php?t=2857
    http://seqanswers.com/forums/showthread.php?t=2332
    """
    numlist.sort(reverse=True)
    s = sum(numlist)
    limit = s * 0.5
    for l in numlist:
        s -= l
        if s <= limit:
            return l

def load_data(files, max_reads):
    """Parses the subread FASTA files and collects the length of the longest
    subread for each ZMW ('read of insert').

    Expects headers of the form >movie/zmw/start_stop; the subread length is
    stop - start, and only the longest subread per ZMW is kept.
    """
    lengths = {}
    i = 0
    for fname in files:
        if i > max_reads:
            break
        with open(fname, 'r') as fh:
            for line in fh:
                line = line.strip().split()[0]
                if line.startswith('>'):
                    i += 1
                    if i > max_reads:
                        break
                    [name, zmw, coord] = line[1:].split('/')
                    start, stop = coord.split('_')
                    read_len = int(stop) - int(start)
                    read_name = name + '_' + zmw
                    # add the length if there is no entry yet, otherwise keep the longest:
                    # 'lengths.get(read_name, 0)' returns either the previously
                    # recorded length for this ZMW, or 0
                    lengths[read_name] = max(lengths.get(read_name, 0), read_len)
    return lengths.values()

def rel_cumsum(lengths):
    """Returns the distinct read lengths sorted from short to long, together
    with the fraction of the total base count contained in reads longer than
    each length. Needed for plotting."""
    c = Counter(lengths)
    total = sum(lengths)
    csum = 0
    cum_len = []
    sort_len = []
    for k in sorted(c.iterkeys()):
        csum = csum + k * c[k]
        cum_len.append((total - csum) / float(total))
        sort_len.append(k)
    return sort_len, cum_len
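# Quick sanity check of N50() on a made-up list of lengths (illustrative numbers
# only, not from any real run): the toy reads total 300 bases, and half of that
# is contained in reads of length >= 80, so the expected N50 is 80.
assert N50([100, 80, 60, 40, 20]) == 80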
max_reads = 1E30  # allows for loading only a subset by passing a smaller number
all_lengths = {}
for indata in inputs:
    libname, fnames = indata
    print libname
    lengths = load_data(fnames, max_reads)
    all_lengths[libname] = lengths

# Summary table: read count, average length, longest-subread N50, and maximum
N50s = {}
print "Library\t# of reads\tAve. length\tLongest subread N50\tLongest"
for indata in inputs:
    libname, fname = indata
    lib_N50 = N50(all_lengths[libname])
    N50s[libname] = lib_N50
    print "%s\t%i\t%i\t%i\t%i" % (libname, len(all_lengths[libname]),
                                  sum(all_lengths[libname]) / len(all_lengths[libname]),
                                  lib_N50, max(all_lengths[libname]))

# Length distribution of the longest subread per ZMW
num_points = None  # 'None' for all reads, or an integer to plot only a subset
plt.figure()
for indata in inputs:
    libname, fname = indata
    y, binEdges = np.histogram(all_lengths[libname][0:num_points], bins=100)
    bincenters = 0.5 * (binEdges[1:] + binEdges[:-1])
    plt.plot(bincenters, y, '-', label=libname)
plt.legend()
plt.xlim(0, 20000)
plt.xlabel('Longest subread length')
plt.ylabel('Count')
plt.savefig(dataset_outfolder + '/' + dataset_name + '_length_dist.pdf')

# Cumulative length distribution; the dashed lines mark each library's N50,
# i.e. the length at which 50% of the bases are contained in longer reads
plt.figure()
for indata in inputs:
    libname, fname = indata
    x, y = rel_cumsum(all_lengths[libname])
    plt.plot(x, [100 * i for i in y], label=libname)
    plt.plot([N50s[libname], N50s[libname]], [0, 50], ls='dashed', color='0.75')
    plt.plot([0, N50s[libname]], [50, 50], ls='dashed', color='0.75')
plt.legend()
plt.xlim(0, 20000)
plt.title('Fraction of bases at different length cutoffs')
plt.xlabel('Minimum length')
plt.ylabel('Percentage of total bases')
plt.savefig(dataset_outfolder + '/' + dataset_name + '_cumul_length_dist.pdf')
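# Optional follow-up, a minimal sketch rather than part of the original workflow:
# instead of reading values off the cumulative plot by eye, the (sort_len, cum_len)
# pair returned by rel_cumsum() can be queried directly. cum_len[i] is the fraction
# of total bases contained in reads longer than sort_len[i], so a binary search
# gives the fraction of bases at or above an arbitrary cutoff. The 5000 bp cutoff
# below is just an example value.
from bisect import bisect_left

def frac_bases_above(sort_len, cum_len, cutoff):
    """Fraction of total bases contained in reads of length >= cutoff."""
    idx = bisect_left(sort_len, cutoff)
    if idx == 0:
        return 1.0  # every read is at least as long as the cutoff
    # all reads longer than the largest length below the cutoff are >= cutoff
    return cum_len[idx - 1]

for indata in inputs:
    libname, fname = indata
    sort_len, cum_len = rel_cumsum(all_lengths[libname])
    print "%s\t%.1f%% of bases in reads >= 5 kb" % (libname, 100 * frac_bases_above(sort_len, cum_len, 5000))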