import synapseclient
import os
import sys
import yaml
import pandas
import re
import matplotlib.pyplot as plt
#adding the lib path
sys.path.append('%s/../lib' % os.getcwd())
#internal modules
import utils
%pylab inline
Populating the interactive namespace from numpy and matplotlib
#login
# Create the Synapse client session. `syn` is the handle every later cell uses
# to store files and run queries. login() is called with no arguments, so the
# credentials are resolved by synapseclient itself (source not shown here).
syn = synapseclient.login()
Welcome, Abhishek Pratap!
#SYN IDS
# Synapse entity used as the parent for the RNA-Seq BAM files and the
# data-generation document stored below.
RNA_SEQ_DATA = 'syn2280648'
#dirs
# Output directory for the QC plots saved by the plotting cells.
qc_dir = '/Volumes/work/DAT_115__RA_Challenge/Data/qc_plots/'
#get the list of bam files for the project
# Document describing how the RNA-Seq data was generated; it is uploaded and
# then referenced as provenance ("used") for every BAM file.
rnaSeq_details_file = '/Volumes/work/DAT_115__RA_Challenge/Documents/RNA_Seq_Data_generation.rtf'
# Local directory that is searched for BAM files.
bam_local_dir = '/Volumes/work/DAT_115__RA_Challenge/Data/'
# NOTE(review): get_FilesList comes from the project-local `utils` module;
# presumably it returns the paths under bam_local_dir matching the glob
# pattern -- confirm whether the search is recursive and whether the returned
# paths are absolute.
bams = utils.get_FilesList(bam_local_dir,pattern='*.bam')
[get_FilesList]: Found 60 files at /Volumes/work/DAT_115__RA_Challenge/Data/
##push bams as links to synapse
# Store the data-generation document first so that each BAM entity can cite it
# as provenance. forceVersion=False is passed so an unchanged re-run is not
# asked to create a new version.
syn_rnaSeq_details = syn.store(
    synapseclient.File(rnaSeq_details_file,
                       name='RNA Seq Data Generation',
                       parent=RNA_SEQ_DATA),
    forceVersion=False)

for bam in bams:
    # The stats file sits next to the BAM: <name>.bam -> <name>.bamStats.
    # Only the extension is swapped, so a "bam" substring elsewhere in the
    # file name is left alone.
    bamStats_file = os.path.splitext(bam)[0] + '.bamStats'

    # safe_load: the stats file is plain data, and yaml.load() without an
    # explicit Loader is rejected by current PyYAML releases. The context
    # manager closes the handle after each file.
    with open(bamStats_file, 'r') as stats_handle:
        bamStats = yaml.safe_load(stats_handle)

    # synapseStore=False registers the BAM as a link instead of uploading it;
    # the parsed stats become the entity's annotations.
    syn_bamFile = synapseclient.File(bam,
                                     synapseStore=False,
                                     parent=RNA_SEQ_DATA,
                                     name=os.path.basename(bam),
                                     annotations=bamStats)

    #add additional metadata
    syn_bamFile['fileType'] = 'bam'
    syn_bamFile['sampleType'] = 'mRNA'

    syn_bamFile = syn.store(syn_bamFile,
                            used=[syn_rnaSeq_details.id],
                            forceVersion=False)
#generate plots for read count summarization
#gather all bamstat per bam file
# One parsed stats mapping per BAM, in the same order as `bams`.
bamStats = []
for bam in bams:
    # <name>.bam -> <name>.bamStats (extension swap only)
    bamStats_file = os.path.splitext(bam)[0] + '.bamStats'
    # safe_load + context manager: plain-data parse, handle closed per file.
    # yaml.load() without a Loader fails on current PyYAML releases.
    with open(bamStats_file, 'r') as stats_handle:
        bamStats.append(yaml.safe_load(stats_handle))

#create a pandas df
# One row per BAM; columns are the keys found in the stats files.
df = pandas.DataFrame(bamStats)

#get only the patient number and set a df index
# Strip everything from the first underscore through the trailing "bam" of the
# file name, leaving the leading identifier as the row label.
df.index = df.bamFile.apply(lambda x: re.sub(r'_.*bam', '', x))

#raw read count per sample
ax = df.reads.plot(kind="bar", figsize=(15, 8), color="#8DA0CB")
ax.set_title('RA Challenge RNA-Seq data : raw read count / sample')
ax.set_xlabel('patient id')
ax.set_ylabel('#reads (in 100 millions)')

# os.path.join avoids the doubled separator that string concatenation gave
# when qc_dir already ends with '/'.
readCount_plot = os.path.join(qc_dir, 'raw_read_count.png')
plt.savefig(readCount_plot, bbox_inches='tight')
# Per-sample ratios relative to the raw read count.
# NOTE(review): these are fractions (count / reads), not values multiplied by
# 100, although the axis label below says 'percent' -- confirm the intended
# scale.
df['%map'] = df.mapped.astype(float) / df.reads
df['%QC_failed'] = df.QC_failed.astype(float) / df.reads
df['%dups'] = df.PCR_optical_dups.astype(float) / df.reads

# Plot exactly the columns computed above. The duplicates ratio is stored as
# '%dups'; selecting a '%PCR_optical_dups' column that was never created
# raises KeyError.
ratio_columns = ['%map', '%QC_failed', '%dups']
ax = df[ratio_columns].plot(figsize=(15, 8))
ax.set_title('RA Challenge RNA-Seq data : reads mapping % / sample')
ax.set_xlabel('patient id')
ax.set_ylabel('percent')

# One tick per sample, labelled with the sample id. The tick count is derived
# from the frame so it stays correct when the number of BAM files changes.
ax.set_xticks(range(len(df.index)))
ax.set_xticklabels(df.index)
# Anchor the legend outside the axes, to the right of the plot area.
ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.xticks(rotation=90)

mapPercent_plot = os.path.join(qc_dir, 'mapping_percent.png')
plt.savefig(mapPercent_plot, bbox_inches='tight')
#Push count data along with provenance
counts_file = "/Users/apratap/Desktop/RA_Data_Expression_Counts.txt"

#counting and analysis script URL
script = 'https://raw.github.com/apratap/apRs/master/RA_Data_Exp_Analysis.R'

#get bam synids used
# Query the same project the BAMs were stored under. The parent id comes from
# RNA_SEQ_DATA so the two cannot drift apart.
bam_query = ('select id from entity WHERE parentId == "%s" and fileType =="bam"'
             % RNA_SEQ_DATA)
# Each result row exposes the entity id under the 'entity.id' key. Reading it
# directly also yields an empty list (rather than a KeyError) when the query
# matches nothing.
bam_synIds = [row['entity.id'] for row in syn.query(bam_query)['results']]

# Store the counts file with provenance: the BAM entities it was derived from
# ("used") and the script that produced it ("executed").
syn_counts_file = synapseclient.File(counts_file, parent='syn2290931')
syn_counts_file = syn.store(syn_counts_file, used=bam_synIds, executed=[script])
..,!! Upload completed in 50 seconds.
File(path='/Users/apratap/Desktop/RA_Data_Expression_Counts.txt', uri=u'/repo/v1/entity/syn2295027', files=['RA_Data_Expression_Counts.txt'], id=u'syn2295027', accessControlList=u'/repo/v1/entity/syn2295027/acl', parentId=u'syn2290931', synapseStore=True, versionUrl=u'/repo/v1/entity/syn2295027/version/1', createdBy=u'Abhishek Pratap', versionNumber=1, versions=u'/repo/v1/entity/syn2295027/version', externalURL=None, versionLabel=u'1', entityType=u'org.sagebionetworks.repo.model.FileEntity', modifiedOn=u'2013-11-06T17:02:53.245Z', etag=u'0f1eb4b8-d166-4a7b-8efa-ecb2a6710218', cacheDir='/Users/apratap/Desktop', creationDate=u'1383757367250', modifiedBy=u'Abhishek Pratap', etag=u'1db20eae-a9f5-4ad0-9f8f-cd2c845991a9', annotations=u'/repo/v1/entity/syn2295027/annotations', uri=u'/entity/syn2295027/annotations', concreteType='org.sagebionetworks.repo.model.FileEntity', createdOn=u'2013-11-06T17:02:47.250Z', md5=None, dataFileHandleId=u'199011', fileSize=None, name=u'RA_Data_Expression_Counts.txt')
# Echo the BAM Synapse ids used as provenance. bam_synIds is already a plain
# list at this point, so it is shown as-is; indexing it with 'entity.id'
# raises TypeError.
bam_synIds