ls /data import csv def load_results(filename): r = csv.DictReader(open(filename, 'rb'), delimiter='\t') results = {} for row in r: gene = row['gene_id'] results[gene] = row return results sample1 = load_results('/data/1.fq.genes.results') # these are replicates sample2 = load_results('/data/2.fq.genes.results') sample1v2 = [] for k in sample1: s1 = float(sample1[k]['FPKM']) s2 = float(sample2[k]['FPKM']) if s1 == 0 or s2 == 0: continue else: sample1v2.append((s1, s2)) sample1v2 = numpy.array(sample1v2) ax = plot(sample1v2[:,0], sample1v2[:,1], 'bo', alpha=0.1) ax = axes() ax.set_yscale('log') ax.set_xscale('log') r = numpy.corrcoef(sample1v2[:,0], sample1v2[:,1])[0,1] r**2 ratio = [] for k in sample1: s1 = float(sample1[k]['FPKM']) s2 = float(sample2[k]['FPKM']) if s1 == 0 or s2 == 0: continue else: ratio.append(math.log(s1 / s2, 2)) hist(ratio, bins=100, range=(-2,2)) x = [1,2,3,4] y = [3,5,7,10] # 10, not 9, so the fit isn't perfect fit = polyfit(x,y,1) print fit fit_fn = poly1d(fit) # fit_fn is now a function which takes in x and returns an estimate for y plot(x,y, 'yo', x, fit_fn(x), '--k') xlim(0, 5) ylim(0, 12) log_1v2 = numpy.log(sample1v2) plot(log_1v2[:,0], log_1v2[:,1], '.', alpha=0.1) fit = polyfit(log_1v2[:,0], log_1v2[:,1], 1) fit_fn = poly1d(fit) plot(log_1v2[:,0], fit_fn(log_1v2[:,0]), 'r-') sample1 = load_results('/data/1.fq.genes.results') sample7 = load_results('/data/7.fq.genes.results') # this is a different time point! sample1v7 = [] for k in sample1: s1 = float(sample1[k]['FPKM']) s2 = float(sample7[k]['FPKM']) if s1 == 0 or s2 == 0: continue else: sample1v7.append((s1, s2)) sample1v7 = numpy.array(sample1v7) ax = plot(sample1v7[:,0], sample1v7[:,1], 'go', alpha=0.1) ax = axes() ax.set_yscale('log') ax.set_xscale('log') xlabel("sample 1 expression") ylabel("sample 2 expression") ratio2 = [] for k in sample1: s1 = float(sample1[k]['FPKM']) s2 = float(sample7[k]['FPKM']) if s1 == 0 or s2 == 0: continue else: ratio2.append(math.log(s1 / s2, 2)) hist(ratio2, bins=100)