# when True, process only a small debug subset of the data
debug = False

# EBI accessions to process
accessions = ['ERP003819', 'ERP003822', 'ERP003820', 'ERP003821', 'ERP005367',
              'ERP005366', 'ERP005361', 'ERP005362', 'ERP005651', 'ERP005821',
              'ERP005949', 'ERP006349', 'ERP008512', 'ERP008604', 'ERP008617']

# set the paths to the Greengenes 13_8 97% OTU reference files
reference_rep_set = '/shared/gg_13_8_otus/rep_set/97_otus.fasta'  # e.g., path to 97_otus.fasta from Greengenes
reference_taxonomy = '/shared/gg_13_8_otus/taxonomy/97_otu_taxonomy.txt'  # e.g., path to 97_otu_taxonomy.txt from Greengenes
reference_tree = '/shared/gg_13_8_otus/trees/97_otus.tree'  # e.g., path to 97_otus.tree from Greengenes

# cluster helpers (submit, submit_qsub, wait_on, farm_commands, pdf_smash)
%run cluster_utils.ipy

import os
import locale
import datetime
import shutil
from functools import partial
from tempfile import mktemp
from collections import defaultdict
from cPickle import loads
from itertools import izip
from ftplib import FTP
from tarfile import open as tar_open
from gzip import open as gz_open
from glob import glob

from IPython.lib.display import FileLink

from americangut.util import fetch_study, trim_fasta, concatenate_files
from americangut.results_utils import (check_file, get_path, get_repository_dir,
                                       stage_static_files, parse_identifying_data,
                                       filter_mapping_file, clean_and_reformat_mapping,
                                       parse_previously_printed, bootstrap_result,
                                       MissingFigure, construct_svg_smash_commands,
                                       construct_phyla_plots_cmds,
                                       per_sample_taxa_summaries,
                                       construct_bootstrap_and_latex_commands,
                                       count_unique_sequences_per_otu,
                                       write_bloom_fasta, harvest)
from americangut.util import count_samples, count_seqs, count_unique_participants

locale.setlocale(locale.LC_ALL, 'en_US')

# get the current absolute path
current_dir = os.path.abspath('.')

# get the path to the American Gut repository
repo_dir = get_repository_dir()

# create a place to do work
prj_name = "americangut_results_r1-14"
working_dir = os.path.join(current_dir, prj_name)
os.makedirs(working_dir)

# path wrappers
get_relative_new_path = lambda x: os.path.join(working_dir, x)
get_relative_existing_path = partial(get_path, working_dir)

# set the number of processors parallel tasks will use
NUM_PROCS = 100
NUM_PROCS_OTU_PICKING = 64

# bootstrap the submit method
submit = partial(submit, prj_name)
submit_smr = lambda x: submit_qsub(x, prj_name, queue='mem512gbq',
                                   extra_args='-l ncpus=64 -l walltime=72:00:00')

# make sure our reference files exist
check_file(reference_rep_set)
check_file(reference_taxonomy)
check_file(reference_tree)

if debug:
    sequence_files = [os.path.join(repo_dir, 'data', 'AG_debug', 'test_seqs.fna'),
                      os.path.join(repo_dir, 'data', 'AG_debug', 'test_seqs_2.fna')]
    mapping_files = [os.path.join(repo_dir, 'data', 'AG_debug', 'test_mapping.txt')]
else:
    sequence_files = [get_relative_new_path(acc + '.fna') for acc in accessions]
    mapping_files = [get_relative_new_path(acc + '.txt') for acc in accessions]

params_file = get_relative_new_path('sortmerna_pick_params.txt')
with open(params_file, 'w') as f:
    f.write("pick_otus:otu_picking_method sortmerna\n")
    f.write("pick_otus:similarity 0.97\n")
    f.write("pick_otus:threads %d\n" % NUM_PROCS_OTU_PICKING)

scripts = {
    'Merge OTU Tables': 'merge_otu_tables.py -i %(input_a)s,%(input_b)s -o %(output)s',
    'Single Rarefaction': 'single_rarefaction.py -i %(input)s -o %(output)s -d %(depth)s',
    'Parallel Beta Diversity': 'parallel_beta_diversity.py -i %(input)s -o %(output)s -X %(job_prefix)s -O %(num_jobs)s -t %(gg97_tree)s',
    'Principal Coordinates': 'principal_coordinates.py -i %(input)s -o %(output)s',
    'Merge Mapping Files': 'merge_mapping_files.py -m %(input_a)s,%(input_b)s -o %(output)s',
    'Filter Samples': 'filter_samples_from_otu_table.py -i %(input)s -o %(output)s --sample_id_fp=%(sample_id_fp)s',
    'Summarize OTU by Category': 'summarize_otu_by_cat.py -m %(mapping)s -o %(output)s -n -i %(otu_table)s -c %(category)s',
    'Filter Distance Matrix': 'filter_distance_matrix.py -i %(input)s -o %(output)s --sample_id_fp=%(sample_ids)s',
    'Summarize Taxa': 'summarize_taxa.py -i %(input)s -o %(output)s -L %(level)s',
    'Summarize Taxa Mapping': 'summarize_taxa.py -i %(input)s -o %(output)s -L %(level)s -m %(mapping)s',
    'Taxonomy Comparison': 'taxonomy_comparison.py -i %(input)s -m %(mapping)s -l %(level)s -o %(output)s -c %(list_of_categories)s',
    'Make Emperor': 'make_emperor.py -i %(input)s -o %(output)s -m %(mapping)s',
    'SVG Smash': 'replace_svg_object.py -i %(input)s -o %(output)s --prefix %(prefix)s --sample_id=%(sample_id)s',
    'Make Phyla Plots': "make_phyla_plots_AGP.py -i %(input)s -m %(mapping)s -o %(output)s -c '%(categories)s' -s %(samples)s %(debug)s",
    'OTU Significance': "generate_otu_signifigance_tables_AGP.py -i %(input)s -o %(output)s -s %(samples)s",  # script name spelled as in the repository
    'Create Titles': 'create_titles.py -m %(mapping)s -f',
    'SVG to PDF': 'inkscape -z -D -f %(input)s -A %(output)s',
    'Format Template': 'format_file.py -i %(template)s -k %(keys_for_replace)s -v %(values_for_replace)s -K %(keys_for_insert)s -V %(values_for_insert)s -o %(output)s',
    'To PDF': 'module load texlive_2013; cd %(path)s; lualatex %(input)s',
    'PDF Smash': 'gs -r150 -q -sPAPERSIZE=ledger -dNOPAUSE -dBATCH -sDEVICE=pdfwrite -dFIXEDMEDIA -dPDFFitPage -dCompatibilityLevel=1.4 -sOutputFile=%(output)s -c 100000000 setvmthreshold -f %(pdfs)s',
    'gunzip': 'gunzip -f %(input)s',
    'Pick Closed Reference OTUs': 'pick_closed_reference_otus.py -i %(input)s -o %(output)s -r %(reference)s -t %(taxonomy)s ' + '-p ' + params_file,
    'Filter Sequences': 'filter_fasta.py -f %(input)s -m %(otus)s -n -o %(output)s',
    'QIIME Charts': 'qiimecharts run-config -i %(input)s',
    'Filter Sequences to Fecal': 'filter_fasta.py -f %(input)s -o %(output)s --mapping_fp %(mapping)s --valid_states "%(states)s"',
    'AGPlots': 'make_plots.py -m %(mapping)s -t %(taxa)s -f pdf -s %(identified)s -c %(metadata_cat)s -v %(metadata_val)s -o %(output_prefix)s -k %(key_taxa)s',
    'Filter Distance Matrix by Metadata': 'filter_distance_matrix.py -i %(input)s -o %(output)s -m %(mapping)s -s %(states)s',
    'Beta Diversity Comparison': 'beta_sample_rarefaction.py -i %(inputs)s -o %(output)s -n 10 --y_max 0.7 -t %(title)s -l %(labels)s -y %(ylabel)s -v'
}
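# Each entry in `scripts` is an old-style Python format template; a concrete
# shell command is produced by %-interpolating a dict of arguments. A quick
# illustration of the mechanics, with hypothetical file names:
example_args = {'input_a': 'a.biom', 'input_b': 'b.biom', 'output': 'ab.biom'}
example_cmd = scripts['Merge OTU Tables'] % example_args
assert example_cmd == 'merge_otu_tables.py -i a.biom,b.biom -o ab.biom'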
# working directory file paths for tables and metadata:
#
# ag  -> American Gut
# pgp -> Personal Genome Project
# hmp -> Human Microbiome Project
# gg  -> Global Gut
#
# _t_ -> table
# _m_ -> mapping file
#
# 100nt -> trimmed to the first 100 nucleotides

stage_static_files('fecal', working_dir, debug=debug)
bloom_seqs = get_relative_existing_path('BLOOM.fasta')

jobs = []
for f in glob(os.path.join(working_dir, "*.biom.gz")):
    jobs.append(submit(scripts['gunzip'] % {'input': f}))
res = wait_on(jobs)

pgp_100nt_t_fp = get_relative_existing_path('PGP_100nt.biom')
pgp_100nt_m_fp = get_relative_existing_path('PGP_100nt.txt')
hmp_100nt_t_fp = get_relative_existing_path('HMPv35_100nt.biom')
hmp_100nt_m_fp = get_relative_existing_path('HMPv35_100nt.txt')
gg_100nt_t_fp = get_relative_existing_path('GG_100nt.biom')
gg_100nt_m_fp = get_relative_existing_path('GG_100nt.txt')

# participant static images
template = get_relative_existing_path('template_gut.tex')
aglogo = get_relative_existing_path('logoshape.pdf')
fig1_legend = get_relative_existing_path('figure1_legend.pdf')
fig2_legend = get_relative_existing_path('figure2_legend.pdf')
fig2_2ndlegend = get_relative_existing_path('figure2_country_legend.pdf')
fig3_legend = get_relative_existing_path('figure3_legend.pdf')
fig1_ovals = get_relative_existing_path('figure1_ovals.png')
fig2_ovals = get_relative_existing_path('figure2_ovals.png')
fig4_overlay = get_relative_existing_path('figure4_overlay.pdf')
ball_legend = get_relative_existing_path('ball_legend.pdf')
title = get_relative_existing_path('youramericangutsampletext.pdf')

to_obtain = []
for acc, seqs, map_ in zip(accessions, sequence_files, mapping_files):
    if not os.path.exists(seqs) or not os.path.exists(map_):
        to_obtain.append((acc, map_, seqs))

if not debug:
    for acc, map_, seqs in to_obtain:
        print "Fetching %s" % acc
        fetch_study(acc, map_, seqs)

if len(mapping_files) == 1:
    mapping_fp = mapping_files[0]
else:
    mapping_fp = get_relative_new_path('combined_mapping.txt')
    merge_args = {'input_a': mapping_files[0], 'input_b': mapping_files[1],
                  'output': mapping_fp}
    res = wait_on(submit(scripts['Merge Mapping Files'] % merge_args))
    for m in mapping_files[2:]:
        merge_args = {'input_a': mapping_fp, 'input_b': m, 'output': mapping_fp}
        res = wait_on(submit(scripts['Merge Mapping Files'] % merge_args))
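# The cluster helpers used throughout (submit, wait_on, farm_commands) are
# provided by cluster_utils.ipy and are specific to the queueing system this
# run used. As a rough, hypothetical sketch of the contract they appear to
# follow (not the actual implementation), a serial, local stand-in might be:
import subprocess

def local_submit(cmd):
    # run the shell command immediately; hand the command string back as the "job"
    subprocess.check_call(cmd, shell=True)
    return cmd

def local_wait_on(jobs, additional_prefix=None):
    # everything already ran synchronously, so just normalize to a list
    return jobs if isinstance(jobs, list) else [jobs]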
# concatenate all sequence files into one merged sequence file; this can take a while!
merged_sequences_full_length_fp = get_relative_new_path('merged_sequences_full_length.fna')
with open(merged_sequences_full_length_fp, 'w') as merged_seqs:
    concatenate_files([open(f, 'U') for f in sequence_files], merged_seqs)
check_file(merged_sequences_full_length_fp)

# pull out just the fecal samples for bloom OTU picking
ag_just_fecal = os.path.splitext(os.path.basename(merged_sequences_full_length_fp))[0] + '-fecal.fna'
filter_args = {'input': merged_sequences_full_length_fp,
               'output': ag_just_fecal,
               'mapping': mapping_fp,
               'states': ':'.join(('BODY_SITE', 'UBERON:feces'))}
wait_on(submit(scripts['Filter Sequences to Fecal'] % filter_args))

# identify the sequences that recruit to the bloom reference
no_ext = os.path.splitext(os.path.basename(ag_just_fecal))[0]
bloom_otus = ag_just_fecal + '-bloom-otus'
bloom_hits = os.path.join(bloom_otus, 'sortmerna_picked_otus', no_ext + '_otus.txt')
bloom_args = {'input': ag_just_fecal,
              'output': bloom_otus,
              'reference': bloom_seqs,
              'taxonomy': reference_taxonomy}
wait_on(submit_smr(scripts['Pick Closed Reference OTUs'] % bloom_args))

# remove the bloom hits from the full set of sequences
bloom_filtered = os.path.splitext(merged_sequences_full_length_fp)[0] + '-debloomed.fna'
filter_args = {'input': merged_sequences_full_length_fp,
               'output': bloom_filtered,
               'otus': bloom_hits}
wait_on(submit(scripts['Filter Sequences'] % filter_args))

# trim the debloomed sequences to the first 100 nucleotides
bloom_filtered_100nt = os.path.splitext(bloom_filtered)[0] + '-100nt.fna'
with open(bloom_filtered, 'U') as merged_seqs, open(bloom_filtered_100nt, 'w') as merged_seqs_trimmed:
    trim_fasta(merged_seqs, merged_seqs_trimmed, 100)
check_file(bloom_filtered_100nt)

# point SortMeRNA at a prebuilt index of the reference set
params_file = get_relative_existing_path('sortmerna_pick_params.txt')
with open(params_file, 'a') as f:
    f.write("pick_otus:sortmerna_db /home/mcdonadt/ResearchWork/gg_13_8_otus/97_otus_idx\n")

full_length_otus = os.path.splitext(bloom_filtered)[0] + '-otus'
full_length_args = {'input': bloom_filtered,
                    'output': full_length_otus,
                    'reference': reference_rep_set,
                    'taxonomy': reference_taxonomy}

trimmed_otus = os.path.splitext(bloom_filtered_100nt)[0] + '-otus'
trimmed_args = {'input': bloom_filtered_100nt,
                'output': trimmed_otus,
                'reference': reference_rep_set,
                'taxonomy': reference_taxonomy}

jobs = [submit_smr(scripts['Pick Closed Reference OTUs'] % full_length_args),
        submit_smr(scripts['Pick Closed Reference OTUs'] % trimmed_args)]
wait_on(jobs)

ag_100nt_m_fp = get_relative_existing_path('combined_mapping.txt')
ag_100nt_t_fp = get_relative_existing_path('merged_sequences_full_length-debloomed-100nt-otus/otu_table.biom')

path_to_participant_data = None
passwd_for_participant_data = None
path_to_previously_printed = None

participants = parse_identifying_data(path_to_participant_data, passwd_for_participant_data)
prev_printed = parse_previously_printed(path_to_previously_printed)

# new file paths
ag_100nt_m_massaged_fp = get_relative_new_path('AG_100nt_massaged.txt')
gg_100nt_m_massaged_fp = get_relative_new_path('GG_100nt_massaged.txt')
pgp_100nt_m_massaged_fp = get_relative_new_path('PGP_100nt_massaged.txt')
hmp_100nt_m_massaged_fp = get_relative_new_path('HMP_100nt_massaged.txt')

# parse the list of PGP barcodes that are in the American Gut
pgp_ids_fp = get_relative_existing_path('pgp_agp_barcodes.txt')
pgp_ids = [l.strip() for l in open(pgp_ids_fp) if not l.startswith('#')]

# massage the mapping files
clean_and_reformat_mapping(open(ag_100nt_m_fp, 'U'), open(ag_100nt_m_massaged_fp, 'w'),
                           'body_site', 'AGP', pgp_ids=pgp_ids)
clean_and_reformat_mapping(open(gg_100nt_m_fp, 'U'), open(gg_100nt_m_massaged_fp, 'w'),
                           'body_site', 'GG')
clean_and_reformat_mapping(open(pgp_100nt_m_fp, 'U'), open(pgp_100nt_m_massaged_fp, 'w'),
                           'body_site', 'PGP',
                           pgp_ids=True)  # every sample in the PGP mapping is flagged as PGP
clean_and_reformat_mapping(open(hmp_100nt_m_fp, 'U'), open(hmp_100nt_m_massaged_fp, 'w'),
                           'bodysite', 'HMP')

# setup output paths (mm -> massaged mapping)
hmp_pgp_mm_fp = get_relative_new_path('HMP_PGP_100nt_massaged.txt')
ag_gg_mm_fp = get_relative_new_path('AG_GG_100nt_massaged.txt')
hmp_pgp_ag_gg_mm_fp = get_relative_new_path('HMP_GG_AG_PGP_100nt_massaged.txt')

hmp_pgp_cmd_args = {'input_a': hmp_100nt_m_massaged_fp,
                    'input_b': pgp_100nt_m_massaged_fp,
                    'output': hmp_pgp_mm_fp}
ag_gg_cmd_args = {'input_a': ag_100nt_m_massaged_fp,
                  'input_b': gg_100nt_m_massaged_fp,
                  'output': ag_gg_mm_fp}
hmp_pgp_ag_gg_cmd_args = {'input_a': hmp_pgp_mm_fp,
                          'input_b': ag_gg_mm_fp,
                          'output': hmp_pgp_ag_gg_mm_fp}

# merge and block until completion
hmp_pgp_job = submit(scripts['Merge Mapping Files'] % hmp_pgp_cmd_args)
ag_gg_job = submit(scripts['Merge Mapping Files'] % ag_gg_cmd_args)
jobs = wait_on([hmp_pgp_job, ag_gg_job])

# merge the combined pair and block until completion
hmp_pgp_ag_gg_job = submit(scripts['Merge Mapping Files'] % hmp_pgp_ag_gg_cmd_args)
jobs = wait_on(hmp_pgp_ag_gg_job)

fig1_m_fp = get_relative_new_path('HMP_GG_AG_PGP_figure1.txt')
fig2_m_fp = get_relative_new_path('AG_GG_fecal_figure2.txt')
fig3_m_fp = get_relative_new_path('AG_fecal_figure3.txt')
fig4_m_fp = get_relative_new_path('AG_fecal_figure4.txt')
oral_m_fp = get_relative_new_path('AG_oral.txt')
skin_m_fp = get_relative_new_path('AG_skin.txt')

# for the first PCoA, keep only these five columns regardless of value
fig1_filter_criteria = {'TITLE_ACRONYM': None,
                        'SIMPLE_BODY_SITE': None,
                        'TITLE_BODY_SITE': None,
                        'HMP_SITE': None,
                        'IS_PGP': None}

# for the second PCoA, keep a similar set of columns, and only the fecal samples
fig2_filter_criteria = {'TITLE_ACRONYM': None,
                        'AGE': None,
                        'SIMPLE_BODY_SITE': lambda x: x == 'FECAL',
                        'COUNTRY': None,
                        'IS_PGP': None}

# for the third PCoA, keep a minimal set of columns and only the fecal samples
fig3_filter_criteria = {'TITLE_ACRONYM': None,
                        'SIMPLE_BODY_SITE': lambda x: x == 'FECAL',
                        'IS_PGP': None}

# for the taxonomy figure, retain the fecal samples plus a few additional categories
fig4_filter_criteria = {'TITLE_ACRONYM': None,
                        'AGE_CATEGORY': None,
                        'SEX': None,
                        'BMI_CATEGORY': None,
                        'DIET_TYPE': None,
                        'SIMPLE_BODY_SITE': lambda x: x == 'FECAL'}
oral_filter_criteria = {'TITLE_ACRONYM': None,
                        'AGE_CATEGORY': None,
                        'SEX': None,
                        'BMI_CATEGORY': None,
                        'DIET_TYPE': None,
                        'SIMPLE_BODY_SITE': lambda x: x == 'ORAL'}
skin_filter_criteria = {'TITLE_ACRONYM': None,
                        'AGE_CATEGORY': None,
                        'SEX': None,
                        'BMI_CATEGORY': None,
                        'DIET_TYPE': None,
                        'SIMPLE_BODY_SITE': lambda x: x == 'SKIN'}

filter_mapping_file(open(hmp_pgp_ag_gg_mm_fp, 'U'), open(fig1_m_fp, 'w'), fig1_filter_criteria)
filter_mapping_file(open(ag_gg_mm_fp, 'U'), open(fig2_m_fp, 'w'), fig2_filter_criteria)
filter_mapping_file(open(ag_100nt_m_massaged_fp, 'U'), open(fig3_m_fp, 'w'), fig3_filter_criteria)
filter_mapping_file(open(ag_100nt_m_massaged_fp, 'U'), open(fig4_m_fp, 'w'), fig4_filter_criteria)
filter_mapping_file(open(ag_100nt_m_massaged_fp, 'U'), open(oral_m_fp, 'w'), oral_filter_criteria)
filter_mapping_file(open(ag_100nt_m_massaged_fp, 'U'), open(skin_m_fp, 'w'), skin_filter_criteria)
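# filter_mapping_file keeps only the columns named in a criteria dict; from
# the way the criteria are constructed above, a value of None appears to mean
# "keep the column, accept any value", while a callable acts as a per-sample
# predicate. A minimal hypothetical criteria dict keeping only fecal samples:
example_criteria = {'TITLE_ACRONYM': None,
                    'SIMPLE_BODY_SITE': lambda x: x == 'FECAL'}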
# resulting paths
hmp_pgp_t_fp = get_relative_new_path("HMP_PGP_100nt.biom")
ag_gg_t_fp = get_relative_new_path("AG_GG_100nt.biom")
hmp_gg_ag_pgp_t_fp = get_relative_new_path("HMP_GG_AG_PGP_100nt.biom")

# setup the command arguments for each call
hmp_pgp_cmd_args = {'input_a': hmp_100nt_t_fp, 'input_b': pgp_100nt_t_fp, 'output': hmp_pgp_t_fp}
ag_gg_cmd_args = {'input_a': ag_100nt_t_fp, 'input_b': gg_100nt_t_fp, 'output': ag_gg_t_fp}
hmp_gg_ag_pgp_cmd_args = {'input_a': ag_gg_t_fp, 'input_b': hmp_pgp_t_fp, 'output': hmp_gg_ag_pgp_t_fp}

# merge and block until completion
hmp_pgp_job = submit(scripts['Merge OTU Tables'] % hmp_pgp_cmd_args)
ag_gg_job = submit(scripts['Merge OTU Tables'] % ag_gg_cmd_args)
jobs = wait_on([hmp_pgp_job, ag_gg_job])

# merge the combined pair and block until completion
hmp_gg_ag_pgp_job = submit(scripts['Merge OTU Tables'] % hmp_gg_ag_pgp_cmd_args)
jobs = wait_on(hmp_gg_ag_pgp_job)

# resulting path
hmp_gg_ag_pgp_t_1k_fp = get_relative_new_path("HMP_GG_AG_PGP_100nt_even1k.biom")

# setup the command arguments
hmp_gg_ag_pgp_t_1k_cmd_args = {'input': hmp_gg_ag_pgp_t_fp,
                               'output': hmp_gg_ag_pgp_t_1k_fp,
                               'depth': '1000'}

# rarefy and block until completion
hmp_gg_ag_pgp_t_1k_job = submit(scripts['Single Rarefaction'] % hmp_gg_ag_pgp_t_1k_cmd_args)
res = wait_on(hmp_gg_ag_pgp_t_1k_job)

# setup output directory
bdiv_dir = lambda x: 'bdiv_' + os.path.basename(x).split('.', 1)[0]
hmp_gg_ag_pgp_1k_unweighted_unifrac_d = get_relative_new_path(bdiv_dir(hmp_gg_ag_pgp_t_1k_fp))

# setup beta diversity arguments
prefix = 'ag2_bdiv_'
hmp_gg_ag_pgp_cmd_args = {'input': hmp_gg_ag_pgp_t_1k_fp,
                          'output': hmp_gg_ag_pgp_1k_unweighted_unifrac_d,
                          'job_prefix': prefix,
                          'num_jobs': 200,
                          'gg97_tree': reference_tree}

# submit and wait
hmp_gg_ag_pgp_bdiv_job = submit_qsub(scripts['Parallel Beta Diversity'] % hmp_gg_ag_pgp_cmd_args,
                                     prj_name, queue='memroute', extra_args='-l pvmem=16gb')
res = wait_on(hmp_gg_ag_pgp_bdiv_job, additional_prefix=prefix)

uw_bdiv_path = get_relative_existing_path(os.path.join(hmp_gg_ag_pgp_1k_unweighted_unifrac_d,
                                                       'unweighted_unifrac_HMP_GG_AG_PGP_100nt_even1k.txt'))
w_bdiv_path = get_relative_existing_path(os.path.join(hmp_gg_ag_pgp_1k_unweighted_unifrac_d,
                                                      'weighted_unifrac_HMP_GG_AG_PGP_100nt_even1k.txt'))
full_bdiv = uw_bdiv_path

fig_path = lambda x: full_bdiv.rsplit('.txt', 1)[0] + '-' + x + '.txt'
fig1_bdiv = fig_path('fig1')
fig2_bdiv = fig_path('fig2')
fig3_bdiv = fig_path('fig3')

fig1_cmd_args = {'input': full_bdiv, 'output': fig1_bdiv, 'sample_ids': fig1_m_fp}
fig2_cmd_args = {'input': full_bdiv, 'output': fig2_bdiv, 'sample_ids': fig2_m_fp}
fig3_cmd_args = {'input': full_bdiv, 'output': fig3_bdiv, 'sample_ids': fig3_m_fp}

jobs = []
jobs.append(submit(scripts['Filter Distance Matrix'] % fig1_cmd_args))
jobs.append(submit(scripts['Filter Distance Matrix'] % fig2_cmd_args))
jobs.append(submit(scripts['Filter Distance Matrix'] % fig3_cmd_args))
res = wait_on(jobs)

pc_path = lambda x: x.rsplit('.txt', 1)[0] + '_pc.txt'

# verify the expected files are present
check_file(fig1_bdiv)
check_file(fig2_bdiv)
check_file(fig3_bdiv)

fig1_pc = pc_path(fig1_bdiv)
fig2_pc = pc_path(fig2_bdiv)
fig3_pc = pc_path(fig3_bdiv)

# setup our arguments
fig1_cmd_args = {'input': fig1_bdiv, 'output': fig1_pc}
fig2_cmd_args = {'input': fig2_bdiv, 'output': fig2_pc}
fig3_cmd_args = {'input': fig3_bdiv, 'output': fig3_pc}

# submit the jobs
jobs = []
jobs.append(submit(scripts['Principal Coordinates'] % fig1_cmd_args))
jobs.append(submit(scripts['Principal Coordinates'] % fig2_cmd_args))
jobs.append(submit(scripts['Principal Coordinates'] % fig3_cmd_args))
job_results = wait_on(jobs)

# quick little helper for naming Emperor output directories
emp_path = lambda x: x.rsplit('.txt', 1)[0] + '-emp'

# verify expected files are present
check_file(fig1_pc)
check_file(fig2_pc)
check_file(fig3_pc)

# setup output paths
fig1_emp = emp_path(fig1_pc)
fig2_emp = emp_path(fig2_pc)
fig3_emp = emp_path(fig3_pc)
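# quick illustration of the path helpers defined above (file names are
# hypothetical): pc_path swaps a .txt suffix for _pc.txt, emp_path for -emp
assert pc_path('foo/bar.txt') == 'foo/bar_pc.txt'
assert emp_path('foo/bar.txt') == 'foo/bar-emp'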
fig3_filter = get_relative_new_path("figure3.biom")
fig3_taxa = get_relative_new_path("figure3_taxa")
fig3_taxa_mapping_fp = os.path.join(fig3_taxa, os.path.splitext(os.path.basename(fig3_m_fp))[0] + '_L2.txt')

# setup arguments
fig3_filter_args = {'input': hmp_gg_ag_pgp_t_1k_fp, 'output': fig3_filter, 'sample_id_fp': fig3_m_fp}
fig3_summarize_args = {'input': fig3_filter, 'output': fig3_taxa, 'level': '2', 'mapping': fig3_m_fp}
fig1_cmd_args = {'input': fig1_pc, 'output': fig1_emp, 'mapping': fig1_m_fp}
fig2_cmd_args = {'input': fig2_pc, 'output': fig2_emp, 'mapping': fig2_m_fp}
fig3_cmd_args = {'input': fig3_pc, 'output': fig3_emp, 'mapping': fig3_taxa_mapping_fp}

# filter the table down to just the AG fecal samples
filter_job = submit(scripts['Filter Samples'] % fig3_filter_args)
res = wait_on(filter_job)

# summarize taxa into the figure 3 mapping file
taxa_job = submit(scripts['Summarize Taxa Mapping'] % fig3_summarize_args)
res = wait_on(taxa_job)
check_file(fig3_taxa_mapping_fp)

jobs = []
jobs.append(submit(scripts['Make Emperor'] % fig1_cmd_args))
jobs.append(submit(scripts['Make Emperor'] % fig2_cmd_args))
jobs.append(submit(scripts['Make Emperor'] % fig3_cmd_args))
res = wait_on(jobs)

emp_index = lambda x: os.path.join(x, 'index.html')

# form the expected paths for Emperor
fig1 = emp_index(fig1_emp)
fig2 = emp_index(fig2_emp)
fig3 = emp_index(fig3_emp)

# verify the expected files are present
check_file(fig1)
check_file(fig2)
check_file(fig3)

FileLink(fig1, result_html_prefix='Figure 1: ')
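# FileLink renders a clickable link to each Emperor visualization in the
# notebook; result_html_prefix is literal text placed in front of the link.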
FileLink(fig2, result_html_prefix='Figure 2: ')

FileLink(fig3, result_html_prefix='Figure 3: ')

DOWNLOAD_DIRECTORY = os.path.expandvars('$HOME')
FIGURE1_EXPECTED_FILENAME = 'Figure_1.tar.gz'
FIGURE2_EXPECTED_FILENAME = 'Figure_2.tar.gz'
FIGURE3_EXPECTED_FILENAME = 'Figure_3.tar.gz'

# helper for composing paths to the downloaded files
source_path = lambda x, y: os.path.join(x, y)

# setup the destination paths
emperor_images = get_relative_new_path('emperor_images_svg')
if not os.path.exists(emperor_images):
    os.mkdir(emperor_images)

# setup the source paths
figure1_src = source_path(DOWNLOAD_DIRECTORY, FIGURE1_EXPECTED_FILENAME)
figure2_src = source_path(DOWNLOAD_DIRECTORY, FIGURE2_EXPECTED_FILENAME)
figure3_src = source_path(DOWNLOAD_DIRECTORY, FIGURE3_EXPECTED_FILENAME)
check_file(figure1_src)
check_file(figure2_src)
check_file(figure3_src)

# unpack the tarballs
jobs = []
jobs.append(submit('tar xzf %s -C %s' % (figure1_src, emperor_images)))
jobs.append(submit('tar xzf %s -C %s' % (figure2_src, emperor_images)))
jobs.append(submit('tar xzf %s -C %s' % (figure3_src, emperor_images)))
res = wait_on(jobs)

all_ag_IDs = set([l.strip().split('\t')[0] for l in open(ag_100nt_m_fp) if not l.startswith('#')])
all_emp_SVGs = os.listdir(emperor_images)

template_files = get_relative_new_path('template_files')
if not os.path.exists(template_files):
    os.mkdir(template_files)

svg_smash_args = {'input': emperor_images, 'output': template_files, 'prefix': None, 'sample_id': None}
commands = construct_svg_smash_commands(all_emp_SVGs, all_ag_IDs, scripts['SVG Smash'], svg_smash_args)
res = farm_commands(commands, 25)

fig4_sex_fp = get_relative_new_path('fig4_sex.biom')
fig4_age_fp = get_relative_new_path('fig4_age_category.biom')
fig4_diet_fp = get_relative_new_path('fig4_diet.biom')
fig4_bmi_fp = get_relative_new_path('fig4_bmi_category.biom')
ag_fecal_t_1k_fp = get_relative_new_path('ag_fecal_even1k.biom')
ag_oral_t_1k_fp = get_relative_new_path('ag_oral_even1k.biom')
ag_skin_t_1k_fp = get_relative_new_path('ag_skin_even1k.biom')
template_files = get_relative_existing_path('template_files')

check_file(hmp_gg_ag_pgp_t_1k_fp)
check_file(fig4_m_fp)
check_file(oral_m_fp)
check_file(skin_m_fp)

filter_fecal_args = {'input': hmp_gg_ag_pgp_t_1k_fp, 'output': ag_fecal_t_1k_fp, 'sample_id_fp': fig4_m_fp}
filter_oral_args = {'input': hmp_gg_ag_pgp_t_1k_fp, 'output': ag_oral_t_1k_fp, 'sample_id_fp': oral_m_fp}
filter_skin_args = {'input': hmp_gg_ag_pgp_t_1k_fp, 'output': ag_skin_t_1k_fp, 'sample_id_fp': skin_m_fp}

otu_by_cat_sex_args = {'otu_table': ag_fecal_t_1k_fp, 'output': fig4_sex_fp, 'mapping': fig4_m_fp, 'category': 'SEX'}
otu_by_cat_age_args = {'otu_table': ag_fecal_t_1k_fp, 'output': fig4_age_fp, 'mapping': fig4_m_fp, 'category': 'AGE_CATEGORY'}
otu_by_cat_diet_args = {'otu_table': ag_fecal_t_1k_fp, 'output': fig4_diet_fp, 'mapping': fig4_m_fp, 'category': 'DIET_TYPE'}
otu_by_cat_bmi_args = {'otu_table': ag_fecal_t_1k_fp, 'output': fig4_bmi_fp, 'mapping': fig4_m_fp, 'category': 'BMI_CATEGORY'}

phyla_plot_args = {'input': ag_fecal_t_1k_fp,
                   'output': template_files,
                   'mapping': fig4_m_fp,
                   'debug': "-d" if debug else "",
                   'categories': 'SEX:%s, AGE_CATEGORY:%s, DIET_TYPE:%s, BMI_CATEGORY:%s' % (fig4_sex_fp, fig4_age_fp, fig4_diet_fp, fig4_bmi_fp)}

# filter the ag table down to the fecal, oral, and skin subsets for fig 4
filter_jobs = []
filter_jobs.append(submit(scripts['Filter Samples'] % filter_fecal_args))
filter_jobs.append(submit(scripts['Filter Samples'] % filter_oral_args))
filter_jobs.append(submit(scripts['Filter Samples'] % filter_skin_args))
res = wait_on(filter_jobs)
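# summarize_otu_by_cat.py (used next) collapses a per-sample OTU table into
# one column per value of a mapping category. A hypothetical miniature of
# that collapse in plain Python, just to show the idea:
def collapse_by_category(counts, sample_to_cat):
    # counts: {sample_id: {otu_id: count}}; sample_to_cat: {sample_id: category value}
    collapsed = defaultdict(lambda: defaultdict(int))
    for sample, otus in counts.items():
        cat = sample_to_cat[sample]
        for otu, n in otus.items():
            collapsed[cat][otu] += n
    return collapsed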
# get the summarized tables
jobs = []
jobs.append(submit(scripts['Summarize OTU by Category'] % otu_by_cat_sex_args))
jobs.append(submit(scripts['Summarize OTU by Category'] % otu_by_cat_age_args))
jobs.append(submit(scripts['Summarize OTU by Category'] % otu_by_cat_diet_args))
jobs.append(submit(scripts['Summarize OTU by Category'] % otu_by_cat_bmi_args))
res = wait_on(jobs)

# farm out the phyla plots
sample_ids = [l.split('\t')[0] for l in open(fig4_m_fp) if not l.startswith('#')]
commands = construct_phyla_plots_cmds(sample_ids, scripts['Make Phyla Plots'], phyla_plot_args)
res = farm_commands(commands, 5)

ag_fecal_t_norare_fp = get_relative_new_path('ag_fecal_norare_filtered.biom')
ag_skin_t_norare_fp = get_relative_new_path('ag_skin_norare_filtered.biom')
ag_oral_t_norare_fp = get_relative_new_path('ag_oral_norare_filtered.biom')
fecal_tax_sum = get_relative_new_path('ag_fecal_norare_taxasum')
skin_tax_sum = get_relative_new_path('ag_skin_norare_taxasum')
oral_tax_sum = get_relative_new_path('ag_oral_norare_taxasum')
template_files = get_relative_existing_path('template_files')

filter_fecal_args = {'input': ag_100nt_t_fp, 'output': ag_fecal_t_norare_fp, 'sample_id_fp': fig4_m_fp}
filter_skin_args = {'input': ag_100nt_t_fp, 'output': ag_skin_t_norare_fp, 'sample_id_fp': skin_m_fp}
filter_oral_args = {'input': ag_100nt_t_fp, 'output': ag_oral_t_norare_fp, 'sample_id_fp': oral_m_fp}

sum_fecal_args = {'input': ag_fecal_t_norare_fp, 'output': fecal_tax_sum, 'level': "2,3,6"}
sum_skin_args = {'input': ag_skin_t_norare_fp, 'output': skin_tax_sum, 'level': "2,3,6"}
sum_oral_args = {'input': ag_oral_t_norare_fp, 'output': oral_tax_sum, 'level': "2,3,6"}

filter_jobs = []
filter_jobs.append(submit(scripts['Filter Samples'] % filter_fecal_args))
filter_jobs.append(submit(scripts['Filter Samples'] % filter_skin_args))
filter_jobs.append(submit(scripts['Filter Samples'] % filter_oral_args))
res = wait_on(filter_jobs)

sum_jobs = []
sum_jobs.append(submit(scripts['Summarize Taxa'] % sum_fecal_args))
sum_jobs.append(submit(scripts['Summarize Taxa'] % sum_skin_args))
sum_jobs.append(submit(scripts['Summarize Taxa'] % sum_oral_args))
res = wait_on(sum_jobs)

# re-submit the fecal taxa summary on its own
sum_fecal_args = {'input': ag_fecal_t_norare_fp, 'output': fecal_tax_sum, 'level': "2,3,6"}
res = wait_on(submit(scripts['Summarize Taxa'] % sum_fecal_args))

template_files = get_relative_existing_path('template_files')
fig5_lvl6 = get_relative_existing_path('ag_fecal_norare_taxasum/ag_fecal_norare_filtered_L6.biom')
sig_args = {'input': fig5_lvl6, 'output': template_files}

from time import sleep

jobs = []
sample_ids = [l.split('\t')[0] for l in open(fig4_m_fp) if not l.startswith('#')]
for id_ in sample_ids:
    args = sig_args.copy()
    args['samples'] = id_
    cmd = scripts['OTU Significance'] % args
    jobs.append(submit(cmd))
    sleep(0.1)
res = wait_on(jobs)

from biom.util import biom_open
with biom_open(fig5_lvl6) as table:
    per_sample_taxa_summaries(table, get_relative_new_path('template_files/Figure_6_%s.txt'))
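# The per-sample loop above spaces submissions out with sleep(0.1) so the
# scheduler is not flooded. The same pattern as a reusable helper (a sketch,
# using the submit/wait_on helpers defined earlier):
def throttled_submit(cmds, delay=0.1):
    # submit each command, pausing briefly between submissions
    handles = []
    for cmd in cmds:
        handles.append(submit(cmd))
        sleep(delay)
    return handles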
static_paths = {'template': template,
                'aglogo': aglogo,
                'fig1_legend': fig1_legend,
                'fig2_legend': fig2_legend,
                'fig2_2ndlegend': fig2_2ndlegend,
                'fig3_legend': fig3_legend,
                'fig4_overlay': fig4_overlay,
                'fig1_ovals': fig1_ovals,
                'fig2_ovals': fig2_ovals,
                'ball_legend': ball_legend,
                'title': title,
                'working_dir': working_dir}

check_file(ag_100nt_m_massaged_fp)
check_file(template)

if not os.path.exists(get_relative_new_path('unidentified')):
    os.mkdir(get_relative_new_path('unidentified'))
if participants is not None and not os.path.exists(get_relative_new_path('identified')):
    os.mkdir(get_relative_new_path('identified'))

base_setup_cmd = 'cd %s; %s'
indiv_cmds, latex_cmds, missing = construct_bootstrap_and_latex_commands(all_ag_IDs, participants,
                                                                         get_relative_existing_path,
                                                                         static_paths, base_setup_cmd,
                                                                         scripts['To PDF'])
_ = farm_commands(indiv_cmds, 25)
_ = farm_commands(latex_cmds, 25)

if participants is not None:
    res = farm_commands(harvest(get_relative_existing_path('identified')), 50)
res = farm_commands(harvest(get_relative_existing_path('unidentified')), 50)

if participants is not None:
    harvest_path = get_relative_existing_path('identified/harvested')
    identified_smash = pdf_smash(harvest_path, 'identified', previously_printed=prev_printed)
    res = farm_commands(identified_smash, 1)

print working_dir

# setup paths
key_taxa = get_relative_existing_path('stacked_plots_key_taxa.txt')
fecal_tax_sum_1k = get_relative_new_path('ag_fecal_1k_taxasum')
oral_tax_sum_1k = get_relative_new_path('ag_oral_1k_taxasum')
skin_tax_sum_1k = get_relative_new_path('ag_skin_1k_taxasum')
fecal_tax_sum_1k_phy = get_relative_new_path('ag_fecal_1k_taxasum/ag_fecal_even1k_L2.txt')
oral_tax_sum_1k_phy = get_relative_new_path('ag_oral_1k_taxasum/ag_oral_even1k_L2.txt')
skin_tax_sum_1k_phy = get_relative_new_path('ag_skin_1k_taxasum/ag_skin_even1k_L2.txt')
uw_unif_dm_fecal = get_relative_new_path('unweighted_unifrac_HMP_GG_AG_PGP_100nt_even1k_fecal.txt')
uw_unif_dm_fecal_hmp = get_relative_new_path('unweighted_unifrac_HMP_even1k_fecal.txt')
uw_unif_dm_fecal_agp = get_relative_new_path('unweighted_unifrac_AGP_even1k_fecal.txt')
w_unif_dm_fecal = get_relative_new_path('weighted_unifrac_HMP_GG_AG_PGP_100nt_even1k_fecal.txt')
w_unif_dm_fecal_hmp = get_relative_new_path('weighted_unifrac_HMP_even1k_fecal.txt')
w_unif_dm_fecal_agp = get_relative_new_path('weighted_unifrac_AGP_even1k_fecal.txt')
qiime_charts_config = get_relative_existing_path('metadata_charts.json')
fig5 = get_relative_new_path('fig5.pdf')

jobs = []

# get summary counts
agp_sample_count = count_samples(open(mapping_fp))
agp_seq_count = count_seqs(open(merged_sequences_full_length_fp))
agp_unique_participants = count_unique_participants(open(mapping_fp))
pgp_sample_count = count_samples(open(ag_100nt_m_massaged_fp), criteria={'IS_PGP': 'Yes'})
pgp_seq_count = count_seqs(open(bloom_filtered_100nt), subset=pgp_ids)
pgp_unique_participants = count_unique_participants(open(ag_100nt_m_massaged_fp), criteria={'IS_PGP': 'Yes'})

# add the counts from the GET2012 sampling, which was not part of the American Gut
pgp_sample_count += 439
pgp_unique_participants += 86
pgp_seq_count += 9509776

# summarize taxa for the stacked taxa plots
sum_fecal_args = {'input': ag_fecal_t_1k_fp, 'output': fecal_tax_sum_1k, 'level': "2"}
sum_oral_args = {'input': ag_oral_t_1k_fp, 'output': oral_tax_sum_1k, 'level': "2"}
sum_skin_args = {'input': ag_skin_t_1k_fp, 'output': skin_tax_sum_1k, 'level': "2"}
jobs.append(submit(scripts['Summarize Taxa'] % sum_fecal_args))
jobs.append(submit(scripts['Summarize Taxa'] % sum_oral_args))
jobs.append(submit(scripts['Summarize Taxa'] % sum_skin_args))
res = wait_on(jobs)
jobs = []

# setup the stacked taxa plots commands; each body site copies the base
# arguments and overrides the site-specific values
agplots_args = {'mapping': ag_100nt_m_massaged_fp,
                'taxa': None,
                'metadata_cat': 'SIMPLE_BODY_SITE',
                'metadata_val': None,
                'output_prefix': None,
                'key_taxa': key_taxa}
agplots_fecal = agplots_args.copy()
agplots_fecal['taxa'] = fecal_tax_sum_1k_phy
agplots_fecal['metadata_val'] = 'FECAL'
agplots_fecal['output_prefix'] = 'ag_plots_fecal_'
agplots_fecal['identified'] = 'fecal_identified.txt'
agplots_oral = agplots_args.copy()
agplots_oral['taxa'] = oral_tax_sum_1k_phy
agplots_oral['metadata_val'] = 'ORAL'
agplots_oral['output_prefix'] = 'ag_plots_oral_'
agplots_oral['identified'] = 'oral_identified.txt'
agplots_skin = agplots_args.copy()
agplots_skin['taxa'] = skin_tax_sum_1k_phy
agplots_skin['metadata_val'] = 'SKIN'
agplots_skin['output_prefix'] = 'ag_plots_skin_'
agplots_skin['identified'] = 'skin_identified.txt'

# setup the initial filtering for the beta diversity HMP/AGP comparison
filter_uwdm_to_fecal = {'input': fig1_bdiv,
                        'mapping': hmp_pgp_ag_gg_mm_fp,
                        'states': 'SIMPLE_BODY_SITE:FECAL',
                        'output': uw_unif_dm_fecal}
filter_wdm_to_fecal = {'input': w_bdiv_path,
                       'mapping': hmp_pgp_ag_gg_mm_fp,
                       'states': 'SIMPLE_BODY_SITE:FECAL',
                       'output': w_unif_dm_fecal}

# submit the stacked taxa plots, the metadata summaries (QIIME Charts), and
# the initial beta diversity filtering commands
jobs.append(submit(scripts['AGPlots'] % agplots_fecal))
jobs.append(submit(scripts['AGPlots'] % agplots_oral))
jobs.append(submit(scripts['AGPlots'] % agplots_skin))
jobs.append(submit(scripts['QIIME Charts'] % {'input': qiime_charts_config}))
jobs.append(submit(scripts['Filter Distance Matrix by Metadata'] % filter_uwdm_to_fecal))
jobs.append(submit(scripts['Filter Distance Matrix by Metadata'] % filter_wdm_to_fecal))
res = wait_on(jobs)
jobs = []

# second round of beta diversity filtering into project-specific results
filter_uwdm_to_fecal_hmp = {'input': uw_unif_dm_fecal, 'mapping': hmp_pgp_ag_gg_mm_fp,
                            'states': 'TITLE_ACRONYM:HMP', 'output': uw_unif_dm_fecal_hmp}
filter_uwdm_to_fecal_agp = {'input': uw_unif_dm_fecal, 'mapping': hmp_pgp_ag_gg_mm_fp,
                            'states': 'TITLE_ACRONYM:AGP', 'output': uw_unif_dm_fecal_agp}
filter_wdm_to_fecal_hmp = {'input': w_unif_dm_fecal, 'mapping': hmp_pgp_ag_gg_mm_fp,
                           'states': 'TITLE_ACRONYM:HMP', 'output': w_unif_dm_fecal_hmp}
filter_wdm_to_fecal_agp = {'input': w_unif_dm_fecal, 'mapping': hmp_pgp_ag_gg_mm_fp,
                           'states': 'TITLE_ACRONYM:AGP', 'output': w_unif_dm_fecal_agp}
jobs.append(submit(scripts['Filter Distance Matrix by Metadata'] % filter_uwdm_to_fecal_hmp))
jobs.append(submit(scripts['Filter Distance Matrix by Metadata'] % filter_uwdm_to_fecal_agp))
jobs.append(submit(scripts['Filter Distance Matrix by Metadata'] % filter_wdm_to_fecal_hmp))
jobs.append(submit(scripts['Filter Distance Matrix by Metadata'] % filter_wdm_to_fecal_agp))
res = wait_on(jobs)

# perform the beta diversity comparison of the HMP and AGP
bdiv_compare_w = {'inputs': ','.join([w_unif_dm_fecal_hmp, w_unif_dm_fecal_agp]),
                  'output': fig5 + '.dtest.pdf',
                  'labels': 'HMP,AGP',
                  'title': "'Beta diversity added by sampled microbial communities'",
                  'ylabel': "'Diversity (weighted UniFrac)'"}
res = wait_on(submit(scripts['Beta Diversity Comparison'] % bdiv_compare_w))
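# Note on the -s/--states syntax used above and in the next round of
# filtering: the format is 'COLUMN:value1,value2', and comma-separated values
# are alternatives, so 'TITLE_ACRONYM:HMP,AGP' keeps a sample when its
# TITLE_ACRONYM is either HMP or AGP.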
pc_path = lambda x: x.rsplit('.txt', 1)[0] + '_pc.txt'
emp_path = lambda x: x.rsplit('.txt', 1)[0] + '-emp'
emp_index = lambda x: os.path.join(x, 'index.html')

# filter beta diversity down to pairwise project comparisons against the AGP
uw_unif_dm_hmp_and_agp = get_relative_new_path('unweighted_unifrac_HMP_AG_100nt_even1k.txt')
uw_unif_dm_pgp_and_agp = get_relative_new_path('unweighted_unifrac_PGP_AG_100nt_even1k.txt')
uw_unif_dm_gg_and_agp = get_relative_new_path('unweighted_unifrac_GG_AG_100nt_even1k.txt')

filter_uwdm_to_hmp_and_agp = {'input': fig1_bdiv, 'mapping': hmp_pgp_ag_gg_mm_fp,
                              'states': 'TITLE_ACRONYM:HMP,AGP', 'output': uw_unif_dm_hmp_and_agp}
filter_uwdm_to_pgp_and_agp = {'input': fig1_bdiv, 'mapping': hmp_pgp_ag_gg_mm_fp,
                              'states': 'TITLE_ACRONYM:AGP,PGP', 'output': uw_unif_dm_pgp_and_agp}

# just show US/Venezuela/Malawi
filter_uwdm_to_gg_and_agp = {'input': fig2_bdiv,
                             'mapping': hmp_pgp_ag_gg_mm_fp,
                             'states': "'COUNTRY:United States of America,Malawi,Venezuela'",
                             'output': uw_unif_dm_gg_and_agp}

jobs = []
jobs.append(submit(scripts['Filter Distance Matrix by Metadata'] % filter_uwdm_to_hmp_and_agp))
jobs.append(submit(scripts['Filter Distance Matrix by Metadata'] % filter_uwdm_to_pgp_and_agp))
jobs.append(submit(scripts['Filter Distance Matrix by Metadata'] % filter_uwdm_to_gg_and_agp))
res = wait_on(jobs)

# compute principal coordinates
uw_unif_pc_hmp_and_agp = pc_path(uw_unif_dm_hmp_and_agp)
uw_unif_pc_pgp_and_agp = pc_path(uw_unif_dm_pgp_and_agp)
uw_unif_pc_gg_and_agp = pc_path(uw_unif_dm_gg_and_agp)

hmp_agp_pc_cmd_args = {'input': uw_unif_dm_hmp_and_agp, 'output': uw_unif_pc_hmp_and_agp}
pgp_agp_pc_cmd_args = {'input': uw_unif_dm_pgp_and_agp, 'output': uw_unif_pc_pgp_and_agp}
gg_agp_pc_cmd_args = {'input': uw_unif_dm_gg_and_agp, 'output': uw_unif_pc_gg_and_agp}

jobs = []
jobs.append(submit(scripts['Principal Coordinates'] % hmp_agp_pc_cmd_args))
jobs.append(submit(scripts['Principal Coordinates'] % pgp_agp_pc_cmd_args))
jobs.append(submit(scripts['Principal Coordinates'] % gg_agp_pc_cmd_args))
res = wait_on(jobs)

# produce the PCoA visualizations
uw_unif_emp_hmp_and_agp = emp_path(uw_unif_pc_hmp_and_agp)
uw_unif_emp_pgp_and_agp = emp_path(uw_unif_pc_pgp_and_agp)
uw_unif_emp_gg_and_agp = emp_path(uw_unif_pc_gg_and_agp)

hmp_agp_emp_cmd_args = {'input': uw_unif_pc_hmp_and_agp, 'output': uw_unif_emp_hmp_and_agp, 'mapping': fig1_m_fp}
pgp_agp_emp_cmd_args = {'input': uw_unif_pc_pgp_and_agp, 'output': uw_unif_emp_pgp_and_agp, 'mapping': fig1_m_fp}
gg_agp_emp_cmd_args = {'input': uw_unif_pc_gg_and_agp, 'output': uw_unif_emp_gg_and_agp, 'mapping': fig2_m_fp}

jobs = []
jobs.append(submit(scripts['Make Emperor'] % hmp_agp_emp_cmd_args))
jobs.append(submit(scripts['Make Emperor'] % pgp_agp_emp_cmd_args))
jobs.append(submit(scripts['Make Emperor'] % gg_agp_emp_cmd_args))
res = wait_on(jobs)

FileLink(emp_index(uw_unif_emp_hmp_and_agp), result_html_prefix='HMP and AGP, please save an SVG as "agp_hmp": ')
FileLink(emp_index(uw_unif_emp_pgp_and_agp), result_html_prefix='PGP and AGP, please save an SVG as "agp_pgp": ')

FileLink(emp_index(uw_unif_emp_gg_and_agp), result_html_prefix='GG and AGP, please save an SVG as "agp_gg_age" and "agp_gg_country": ')

# SVG to PDF
agp_hmp_svg = source_path(DOWNLOAD_DIRECTORY, 'agp_hmp.svg')
agp_pgp_svg = source_path(DOWNLOAD_DIRECTORY, 'agp_pgp.svg')
agp_gg_age_svg = source_path(DOWNLOAD_DIRECTORY, 'agp_gg_age.svg')
agp_gg_country_svg = source_path(DOWNLOAD_DIRECTORY, 'agp_gg_country.svg')
check_file(agp_hmp_svg)
check_file(agp_pgp_svg)
check_file(agp_gg_age_svg)
check_file(agp_gg_country_svg)

agp_hmp_pdf = get_relative_new_path('agp_hmp.pdf')
agp_pgp_pdf = get_relative_new_path('agp_pgp.pdf')
agp_gg_age_pdf = get_relative_new_path('agp_gg_age.pdf')
agp_gg_country_pdf = get_relative_new_path('agp_gg_country.pdf')

agp_hmp_args = {'input': agp_hmp_svg, 'output': agp_hmp_pdf}
agp_pgp_args = {'input': agp_pgp_svg, 'output': agp_pgp_pdf}
agp_gg_age_args = {'input': agp_gg_age_svg, 'output': agp_gg_age_pdf}
agp_gg_country_args = {'input': agp_gg_country_svg, 'output': agp_gg_country_pdf}

jobs = []
jobs.append(submit(scripts['SVG to PDF'] % agp_hmp_args))
jobs.append(submit(scripts['SVG to PDF'] % agp_pgp_args))
jobs.append(submit(scripts['SVG to PDF'] % agp_gg_age_args))
jobs.append(submit(scripts['SVG to PDF'] % agp_gg_country_args))
res = wait_on(jobs)

macros_fp = get_relative_new_path('mod1_macros.tex')

# get the date and format it for human consumption (e.g., January 1, 2014)
cur_date = datetime.datetime.now()
date_fmt = cur_date.strftime("%B %d, %Y")

# format the counts with comma grouping (e.g., 1,234,567)
agp_samples_fmt = locale.format("%d", agp_sample_count, grouping=True)
agp_participants_fmt = locale.format("%d", agp_unique_participants, grouping=True)
agp_sequences_fmt = locale.format("%d", agp_seq_count, grouping=True)
pgp_samples_fmt = locale.format("%d", pgp_sample_count, grouping=True)
pgp_participants_fmt = locale.format("%d", pgp_unique_participants, grouping=True)
pgp_sequences_fmt = locale.format("%d", pgp_seq_count, grouping=True)

# build the macro template for the latex document
macro_template = ["% release date"]
macro_template.append("\\def\\releaseDate{%s}" % date_fmt)
macro_template.append("% participants paragraph")
macro_template.append("\\def\\numSamples{%s}" % agp_samples_fmt)
macro_template.append("\\def\\numParticipants{%s}" % agp_participants_fmt)

# published studies
macro_template.append("% participants table")
macro_template.append("\\def\\hmpAge{Adults}")
macro_template.append("\\def\\hmpLocation{USA}")
macro_template.append("\\def\\hmpSamples{4,788}")
macro_template.append("\\def\\hmpParticipants{242}")
macro_template.append("\\def\\hmpSequences{36,797,226}")
macro_template.append("\\def\\ggAge{Adults,Children}")
macro_template.append("\\def\\ggLocation{Venezuela, Malawi, USA}")
macro_template.append("\\def\\ggSamples{531}")
macro_template.append("\\def\\ggParticipants{531}")
macro_template.append("\\def\\ggSequences{1,093,740,274}")

# growing studies
macro_template.append("\\def\\pgpAge{Adults}")
macro_template.append("\\def\\pgpLocation{USA}")
macro_template.append("\\def\\pgpSamples{%s}" % pgp_samples_fmt)
macro_template.append("\\def\\pgpParticipants{%s}" % pgp_participants_fmt)
macro_template.append("\\def\\pgpSequences{%s}" % pgp_sequences_fmt)
macro_template.append("\\def\\agpAge{Adults, Children}")
macro_template.append("\\def\\agpLocation{Global}")
macro_template.append("\\def\\agpSamples{%s}" % agp_samples_fmt)
macro_template.append("\\def\\agpParticipants{%s}" % agp_participants_fmt)
macro_template.append("\\def\\agpSequences{%s}" % agp_sequences_fmt)
macro_template.append("% diversity figure")
macro_template.append("\\def\\numParticipantsLowerEstimate{%s}" % agp_participants_fmt)
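# each entry renders to a literal TeX \def line; for example, with the en_US
# locale set at the top of this notebook:
example_fmt = locale.format("%d", 1234567, grouping=True)   # '1,234,567'
example_macro = "\\def\\numSamples{%s}" % example_fmt       # '\def\numSamples{1,234,567}'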
with open(macros_fp, 'w') as macros:
    macros.write('\n'.join(macro_template))
    macros.write('\n')

# summary static images
for_mod1 = ['ag_plots_fecal_legend.pdf',
            'ag_plots_fecal_stack.pdf',
            'ag_plots_oral_stack.pdf',
            'ag_plots_skin_stack.pdf',
            'agp_gg_age.pdf',
            'agp_gg_country.pdf',
            'agp_hmp.pdf',
            'agp_pgp.pdf',
            'fig2.pdf',
            'fig3.pdf',
            'fig5.pdf',
            'fig6_a.pdf',
            'fig6_b.pdf',
            'fig6_legend.pdf',
            's1.pdf',
            's2.pdf',
            's3.pdf',
            'logoshape.pdf',
            'legend_age.pdf',
            'legend_agp_gg.pdf',
            'legend_agp_hmp_pgp.pdf']

os.mkdir(get_relative_new_path('pdfs-mod1'))
for f in for_mod1:
    shutil.copy(get_relative_existing_path(f), get_relative_new_path('pdfs-mod1/'))

# compile the module 1 summary document
res = wait_on(submit(scripts['To PDF'] % {'path': working_dir, 'input': 'mod1_main.tex'}))
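# sanity check that the compiled summary exists; mod1_main.pdf is lualatex's
# default output name for mod1_main.tex, assuming the default jobname
check_file(os.path.join(working_dir, 'mod1_main.pdf'))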