import synapseclient
import os
import sys
import itertools
#adding the lib path
sys.path.append('/Users/abhishek/dev/appys/lib/')
#internal modules
import utils
! ls -l $synapseclient.__file__
# ---- Synapse session ----
# open a client and authenticate before any store/get call below
syn = synapseclient.Synapse()
syn.login()

# ---- run settings ----
# passed as forceVersion= on every syn.store call in this script
syn_forceVersion = False
# passed as synapseStore= for the fastq and bam File entities
syn_STORE = False

# ---- Synapse ids of the project, its folders and the tools used ----
project_folder = 'syn2275628'
scripts_folder = 'syn2276110'
tophat_used = 'syn2243144'
cufflinks_used = 'syn2243146'
fastqs_folder = 'syn2276483'
bams_folder = 'syn2276484'
expressions_file_folder = 'syn2276109'

# ---- locate the input fastqs (read 1 list first, then read 2) ----
basedata_dir = "/Users/abhishek/apratap_bt/projects/PCBC_integrative_analysis/data/"
read1_fastqs, read2_fastqs = (
    utils.get_FilesList(basedata_dir, pattern=fastq_pattern)
    for fastq_pattern in ("*LIB*R1.fastq.bz2", "*LIB*R2.fastq.bz2")
)

# ---- store the wrapper scripts under the scripts folder ----
tophatScript = syn.store(
    synapseclient.File(
        "/Users/abhishek/apratap_bt/projects/PCBC_integrative_analysis/scripts/runTophat.sh",
        name='tophat script',
        parent=scripts_folder,
    ),
    forceVersion=syn_forceVersion,
)
# fetch the tophat entity (version 1) without downloading its file
tophat = syn.get(tophat_used, downloadFile=False, version=1)
cufflinksScript = syn.store(
    synapseclient.File(
        "/Users/abhishek/apratap_bt/projects/PCBC_integrative_analysis/scripts/runCufflinks.sh",
        name='cufflinks script',
        parent=scripts_folder,
    ),
    forceVersion=syn_forceVersion,
)
def create_fastq_page_markdown(fastq, attachments):
    """Build the markdown text for a fastq QC report wiki page.

    Parameters:
        fastq: path of the fastq file the report belongs to. Kept for
            interface compatibility; it does not influence the output.
        attachments: iterable of image file paths. Only the base name of
            each path is used.

    Returns:
        A string starting with the '## QC Report' header line, followed for
        every image by 'Metric <label>' (the base name with '.png' removed
        and underscores turned into spaces) and an ${image?...} tag that
        references the image by base name.
    """
    pieces = ['## QC Report \n']
    for image in attachments:
        image_file = os.path.basename(image)
        label = image_file.replace('.png', '').replace('_', ' ')
        pieces.append('Metric %s' % label)
        pieces.append('${image?fileName=%s&align=none&scale=50}' % image_file)
    # join once instead of growing the string with repeated +=
    return ''.join(pieces)
def readAlignSummary(fileName):
    """Parse read counts and the mapping rate out of an alignment summary.

    The file is read strictly by line position (the caller passes the
    'align_summary.txt' found in a tophat_out directory):

        line 1  skipped (header of the first read block)
        line 2  '<label>: <int>'            -> numberReads_1
        line 3  '<label>: <int> (<...>)'    -> mappedReads_1
        line 4  skipped
        line 5  skipped (header of the second read block)
        line 6  '<label>: <int>'            -> numberReads_2
        line 7  '<label>: <int> (<...>)'    -> mappedReads_2
        line 8  skipped
        line 9  '<number>% ...'             -> percentMapped

    Parameters:
        fileName: path of the summary file.

    Returns:
        Tuple (numberReads_1, mappedReads_1, numberReads_2, mappedReads_2,
        percentMapped); the four counts are ints, percentMapped is the text
        of line 9 up to the first '%' with the '%' sign re-appended.

    Raises:
        IOError/OSError if the file cannot be opened; IndexError or
        ValueError if a line does not have the expected shape.
    """
    with open(fileName) as f:
        f.readline()  # line 1: header, not needed
        numberReads_1 = int(f.readline().split(':')[1])
        # the mapped count is followed by a parenthesised percentage
        mappedReads_1 = int(f.readline().split(':')[1].split('(')[0])
        f.readline()  # line 4: not needed
        f.readline()  # line 5: header of the second block
        numberReads_2 = int(f.readline().split(':')[1])
        mappedReads_2 = int(f.readline().split(':')[1].split('(')[0])
        f.readline()  # line 8: not needed
        percentMapped = f.readline().split('%')[0] + '%'
    return numberReads_1, mappedReads_1, numberReads_2, mappedReads_2, percentMapped
# Walk the read-1 and read-2 fastq lists in lockstep and, for every sample,
# push the fastqs, their FastQC images (as wiki pages), the mapped bam and the
# gene-level FPKM file to Synapse, recording provenance along the way.
# NOTE(review): pairing relies on both lists coming back in the same order
# from utils.get_FilesList -- the prefix check below guards against a mismatch.
# zip() is used instead of itertools.izip so the loop runs on Python 2 and 3.
for read1_fastq, read2_fastq in zip(read1_fastqs, read2_fastqs):
    read1_prefix = os.path.basename(read1_fastq).replace('_R1.fastq.bz2', '')
    read2_prefix = os.path.basename(read2_fastq).replace('_R2.fastq.bz2', '')

    #sanity check for read 1 and read 2 of same fastq/ sample
    if read1_prefix != read2_prefix:
        print('not read 1 and read 2.. prefix different')
        # fixed: this used the misspelled name 'read2_preifx', which raised
        # NameError instead of reporting the mismatch
        print('%s != %s' % (read1_prefix, read2_prefix))
        break

    # the prefix must consist of exactly four '_'-separated fields
    lib, sample, index, lane = read1_prefix.split('_')
    fastq_annotations = {'lib': lib, 'sample': sample, 'index': index,
                         'lane': lane, 'dataType': 'mRNA'}

    # per-sample output locations, relative to the fastq's directory
    tophat_dir = os.path.dirname(read1_fastq) + '/tophat_out/'
    align_summary_file = tophat_dir + 'align_summary.txt'
    mapped_bam_file = tophat_dir + 'accepted_hits.bam'
    cufflinks_dir = os.path.dirname(read1_fastq) + '/cufflinks/'
    genes_fpkm_file = cufflinks_dir + 'genes.fpkm_tracking'

    #mapping stats
    numberReads_1, mappedReads_1, numberReads_2, mappedReads_2, percentMapped = readAlignSummary(align_summary_file)
    # fastq annotations first, mapping stats layered on top; built with
    # copy + update because dict.items() results cannot be added on Python 3
    mapping_annotations = dict(fastq_annotations)
    mapping_annotations.update({'nReads_1': numberReads_1, 'nReads_2': numberReads_2,
                                'mappedReads_1': mappedReads_1, 'mappedReads_2': mappedReads_2,
                                'percentMapped': percentMapped})

    ################
    #Upload files to synapse
    ################

    ######
    #fastqs
    #######
    #add read 1
    syn_read1_fastq = synapseclient.File(read1_fastq, parent=fastqs_folder,
                                         synapseStore=syn_STORE,
                                         name=sample + '_read1.fastq', annotations=fastq_annotations)
    syn_read1_fastq['read'] = 1
    syn_read1_fastq = syn.store(syn_read1_fastq, forceVersion=syn_forceVersion)

    #add read 2
    syn_read2_fastq = synapseclient.File(read2_fastq, parent=fastqs_folder,
                                         synapseStore=syn_STORE,
                                         name=sample + '_read2.fastq', annotations=fastq_annotations)
    syn_read2_fastq['read'] = 2
    syn_read2_fastq = syn.store(syn_read2_fastq, forceVersion=syn_forceVersion)

    #########
    #push QC images to a wiki
    #########
    #get list of files to attach
    fastqc_folder_read1 = os.path.dirname(read1_fastq) + '/FastQC/' + read1_prefix + '_R1_fastqc/'
    fastqc_read1_images = utils.get_FilesList(fastqc_folder_read1 + '/Images', pattern="*png")
    #create the wiki, owned by the read-1 fastq entity
    wiki_read1_fastq = synapseclient.Wiki(title='QC Report %s' % sample,
                                          owner=syn_read1_fastq.id,
                                          attachments=fastqc_read1_images)
    #create and push the markdown
    wiki_read1_fastq.markdown = create_fastq_page_markdown(read1_fastq, fastqc_read1_images)
    wiki_read1_fastq = syn.store(wiki_read1_fastq)

    #get list of files to attach
    fastqc_folder_read2 = os.path.dirname(read2_fastq) + '/FastQC/' + read2_prefix + '_R2_fastqc/'
    fastqc_read2_images = utils.get_FilesList(fastqc_folder_read2 + '/Images', pattern="*png")
    #create the wiki, owned by the read-2 fastq entity
    wiki_read2_fastq = synapseclient.Wiki(title='QC Report %s' % sample,
                                          owner=syn_read2_fastq.id,
                                          attachments=fastqc_read2_images)
    #create and push the markdown
    wiki_read2_fastq.markdown = create_fastq_page_markdown(read2_fastq, fastqc_read2_images)
    wiki_read2_fastq = syn.store(wiki_read2_fastq)

    #######
    #mapped bams
    #######
    mapped_bam = synapseclient.File(mapped_bam_file, parent=bams_folder,
                                    annotations=mapping_annotations,
                                    name=sample + '.bam',
                                    synapseStore=syn_STORE)
    mapped_bam['fileType'] = 'bam'
    mapped_bam['bamType'] = 'mapped'
    # provenance: the bam was derived from both fastqs by tophat + its script
    mapped_bam = syn.store(mapped_bam, used=[syn_read1_fastq, syn_read2_fastq],
                           executed=[tophat, tophatScript],
                           forceVersion=syn_forceVersion)

    ######
    #expression counts : genes
    ######
    # NOTE(review): synapseStore is hard-coded to True here while the fastqs
    # and bam use syn_STORE -- confirm this difference is intentional
    genes_fpkm = synapseclient.File(genes_fpkm_file, parent=expressions_file_folder,
                                    annotations=mapping_annotations,
                                    name=sample + '_genes.fpkm_tracking',
                                    synapseStore=True)
    genes_fpkm['fileType'] = 'genes_fpkm'
    # provenance: FPKM values were derived from the bam by cufflinks + its script
    genes_fpkm = syn.store(genes_fpkm, used=[mapped_bam],
                           executed=[cufflinks_used, cufflinksScript],
                           forceVersion=syn_forceVersion)
# merging the expression calls
# store the merge script so it can be recorded as the executed step below
mergeScript = syn.store(
    synapseclient.File(
        "/Users/abhishek/apratap_bt/projects/PCBC_integrative_analysis/scripts/mergeExpression.py",
        name="merge Expression script",
        parent=scripts_folder,
    ),
    forceVersion=syn_forceVersion,
)

# ids of every entity whose parent is the expression-files folder; the folder
# id comes from expressions_file_folder instead of a second hard-coded copy of
# the same id, so the query cannot drift from the upload target used earlier
fpkm_files_used = [
    row['entity.id']
    for row in syn.chunkedQuery(
        "select id from entity where parentId=='%s'" % expressions_file_folder
    )
]

#push the merged data
mergedExp = synapseclient.File(
    "/Users/abhishek/apratap_bt/projects/PCBC_integrative_analysis/data/summarized_expression_calls.tsv",
    name="Summarized Expression Calls",
    parent=project_folder,
)
# provenance: merged table was derived from the per-sample FPKM files by the merge script
mergedExp = syn.store(mergedExp, used=fpkm_files_used,
                      executed=mergeScript)
# (captured console output, not code) ..,!!! Upload completed in 6 seconds.
#push the heatmap / correlation script
geneExp_analysis_script = syn.store( synapseclient.File('/Users/abhishek/apratap_bt/dev/apRs/
?synapseclient.File