# when True, process only a small debug subset of the data
debug = False

# EBI accessions to process
accessions = ['ERP003819', 'ERP003822', 'ERP003820', 'ERP003821', 'ERP005367',
              'ERP005366', 'ERP005361', 'ERP005362', 'ERP005651', 'ERP005821',
              'ERP005949', 'ERP006349', 'ERP008512', 'ERP008604', 'ERP008617']

# set the paths to the Greengenes 13_8 97% OTU reference files
reference_rep_set = '/shared/gg_13_8_otus/rep_set/97_otus.fasta'  # e.g., path to 97_otus.fasta from Greengenes
reference_taxonomy = '/shared/gg_13_8_otus/taxonomy/97_otu_taxonomy.txt'  # e.g., path to 97_otu_taxonomy.txt from Greengenes
reference_tree = '/shared/gg_13_8_otus/trees/97_otus.tree'  # e.g., path to 97_otus.tree from Greengenes

# cluster helpers (submit, submit_qsub, wait_on, farm_commands, pdf_smash)
%run cluster_utils.ipy

import os
import locale
import datetime
import shutil
from functools import partial
from tempfile import mktemp
from collections import defaultdict
from cPickle import loads
from itertools import izip
from ftplib import FTP
from tarfile import open as tar_open
from gzip import open as gz_open
from glob import glob

from IPython.lib.display import FileLink

from americangut.util import fetch_study, trim_fasta, concatenate_files
from americangut.results_utils import (check_file, get_path, get_repository_dir,
                                       stage_static_files, parse_identifying_data,
                                       filter_mapping_file, clean_and_reformat_mapping,
                                       parse_previously_printed, bootstrap_result,
                                       MissingFigure, construct_svg_smash_commands,
                                       construct_phyla_plots_cmds,
                                       per_sample_taxa_summaries,
                                       construct_bootstrap_and_latex_commands,
                                       count_unique_sequences_per_otu,
                                       write_bloom_fasta, harvest)
from americangut.util import count_samples, count_seqs, count_unique_participants

locale.setlocale(locale.LC_ALL, 'en_US')

# get the current absolute path
current_dir = os.path.abspath('.')

# get the path to the American Gut repository
repo_dir = get_repository_dir()

# create a place to do work
prj_name = "americangut_results_r1-14"
working_dir = os.path.join(current_dir, prj_name)
os.makedirs(working_dir)

# path wrappers
get_relative_new_path = lambda x: os.path.join(working_dir, x)
get_relative_existing_path = partial(get_path, working_dir)

# set the number of processors parallel tasks will use
NUM_PROCS = 100
NUM_PROCS_OTU_PICKING = 64

# bootstrap the submit method
submit = partial(submit, prj_name)
submit_smr = lambda x: submit_qsub(x, prj_name, queue='mem512gbq',
                                   extra_args='-l ncpus=64 -l walltime=72:00:00')

# make sure our reference files exist
check_file(reference_rep_set)
check_file(reference_taxonomy)
check_file(reference_tree)

if debug:
    sequence_files = [os.path.join(repo_dir, 'data', 'AG_debug', 'test_seqs.fna'),
                      os.path.join(repo_dir, 'data', 'AG_debug', 'test_seqs_2.fna')]
    mapping_files = [os.path.join(repo_dir, 'data', 'AG_debug', 'test_mapping.txt')]
else:
    sequence_files = [get_relative_new_path(acc + '.fna') for acc in accessions]
    mapping_files = [get_relative_new_path(acc + '.txt') for acc in accessions]

params_file = get_relative_new_path('sortmerna_pick_params.txt')
with open(params_file, 'w') as f:
    f.write("pick_otus:otu_picking_method sortmerna\n")
    f.write("pick_otus:similarity 0.97\n")
    f.write("pick_otus:threads %d\n" % NUM_PROCS_OTU_PICKING)

scripts = {
    'Merge OTU Tables': 'merge_otu_tables.py -i %(input_a)s,%(input_b)s -o %(output)s',
    'Single Rarefaction': 'single_rarefaction.py -i %(input)s -o %(output)s -d %(depth)s',
    'Parallel Beta Diversity': 'parallel_beta_diversity.py -i %(input)s -o %(output)s -X %(job_prefix)s -O %(num_jobs)s -t %(gg97_tree)s',
    'Principal Coordinates': 'principal_coordinates.py -i %(input)s -o %(output)s',
    'Merge Mapping Files': 'merge_mapping_files.py -m %(input_a)s,%(input_b)s -o %(output)s',
    'Filter Samples': 'filter_samples_from_otu_table.py -i %(input)s -o %(output)s --sample_id_fp=%(sample_id_fp)s',
    'Summarize OTU by Category': 'summarize_otu_by_cat.py -m %(mapping)s -o %(output)s -n -i %(otu_table)s -c %(category)s',
    'Filter Distance Matrix': 'filter_distance_matrix.py -i %(input)s -o %(output)s --sample_id_fp=%(sample_ids)s',
    'Summarize Taxa': 'summarize_taxa.py -i %(input)s -o %(output)s -L %(level)s',
    'Summarize Taxa Mapping': 'summarize_taxa.py -i %(input)s -o %(output)s -L %(level)s -m %(mapping)s',
    'Taxonomy Comparison': 'taxonomy_comparison.py -i %(input)s -m %(mapping)s -l %(level)s -o %(output)s -c %(list_of_categories)s',
    'Make Emperor': 'make_emperor.py -i %(input)s -o %(output)s -m %(mapping)s',
    'SVG Smash': 'replace_svg_object.py -i %(input)s -o %(output)s --prefix %(prefix)s --sample_id=%(sample_id)s',
    'Make Phyla Plots': "make_phyla_plots_AGP.py -i %(input)s -m %(mapping)s -o %(output)s -c '%(categories)s' -s %(samples)s %(debug)s",
    'OTU Significance': "generate_otu_signifigance_tables_AGP.py -i %(input)s -o %(output)s -s %(samples)s",  # script name spelled as in the repository
    'Create Titles': 'create_titles.py -m %(mapping)s -f',
    'SVG to PDF': 'inkscape -z -D -f %(input)s -A %(output)s',
    'Format Template': 'format_file.py -i %(template)s -k %(keys_for_replace)s -v %(values_for_replace)s -K %(keys_for_insert)s -V %(values_for_insert)s -o %(output)s',
    'To PDF': 'module load texlive_2013; cd %(path)s; lualatex %(input)s',
    'PDF Smash': 'gs -r150 -q -sPAPERSIZE=ledger -dNOPAUSE -dBATCH -sDEVICE=pdfwrite -dFIXEDMEDIA -dPDFFitPage -dCompatibilityLevel=1.4 -sOutputFile=%(output)s -c 100000000 setvmthreshold -f %(pdfs)s',
    'gunzip': 'gunzip -f %(input)s',
    'Pick Closed Reference OTUs': 'pick_closed_reference_otus.py -i %(input)s -o %(output)s -r %(reference)s -t %(taxonomy)s ' + '-p ' + params_file,
    'Filter Sequences': 'filter_fasta.py -f %(input)s -m %(otus)s -n -o %(output)s',
    'QIIME Charts': 'qiimecharts run-config -i %(input)s',
    'Filter Sequences to Fecal': 'filter_fasta.py -f %(input)s -o %(output)s --mapping_fp %(mapping)s --valid_states "%(states)s"',
    'AGPlots': 'make_plots.py -m %(mapping)s -t %(taxa)s -f pdf -s %(identified)s -c %(metadata_cat)s -v %(metadata_val)s -o %(output_prefix)s -k %(key_taxa)s',
    'Filter Distance Matrix by Metadata': 'filter_distance_matrix.py -i %(input)s -o %(output)s -m %(mapping)s -s %(states)s',
    'Beta Diversity Comparison': 'beta_sample_rarefaction.py -i %(inputs)s -o %(output)s -n 10 --y_max 0.7 -t %(title)s -l %(labels)s -y %(ylabel)s -v'
}
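# Each entry in `scripts` is an old-style Python format template; a concrete
# shell command is produced by %-interpolating a dict of arguments. A quick
# illustration of the mechanics, with hypothetical file names:
example_args = {'input_a': 'a.biom', 'input_b': 'b.biom', 'output': 'ab.biom'}
example_cmd = scripts['Merge OTU Tables'] % example_args
assert example_cmd == 'merge_otu_tables.py -i a.biom,b.biom -o ab.biom'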
# working directory file paths for tables and metadata:
#
# ag  -> American Gut
# pgp -> Personal Genome Project
# hmp -> Human Microbiome Project
# gg  -> Global Gut
#
# _t_ -> table
# _m_ -> mapping file
#
# 100nt -> trimmed to the first 100 nucleotides

stage_static_files('fecal', working_dir, debug=debug)
bloom_seqs = get_relative_existing_path('BLOOM.fasta')

jobs = []
for f in glob(os.path.join(working_dir, "*.biom.gz")):
    jobs.append(submit(scripts['gunzip'] % {'input': f}))
res = wait_on(jobs)

pgp_100nt_t_fp = get_relative_existing_path('PGP_100nt.biom')
pgp_100nt_m_fp = get_relative_existing_path('PGP_100nt.txt')
hmp_100nt_t_fp = get_relative_existing_path('HMPv35_100nt.biom')
hmp_100nt_m_fp = get_relative_existing_path('HMPv35_100nt.txt')
gg_100nt_t_fp = get_relative_existing_path('GG_100nt.biom')
gg_100nt_m_fp = get_relative_existing_path('GG_100nt.txt')

# participant static images
template = get_relative_existing_path('template_gut.tex')
aglogo = get_relative_existing_path('logoshape.pdf')
fig1_legend = get_relative_existing_path('figure1_legend.pdf')
fig2_legend = get_relative_existing_path('figure2_legend.pdf')
fig2_2ndlegend = get_relative_existing_path('figure2_country_legend.pdf')
fig3_legend = get_relative_existing_path('figure3_legend.pdf')
fig1_ovals = get_relative_existing_path('figure1_ovals.png')
fig2_ovals = get_relative_existing_path('figure2_ovals.png')
fig4_overlay = get_relative_existing_path('figure4_overlay.pdf')
ball_legend = get_relative_existing_path('ball_legend.pdf')
title = get_relative_existing_path('youramericangutsampletext.pdf')

to_obtain = []
for acc, seqs, map_ in zip(accessions, sequence_files, mapping_files):
    if not os.path.exists(seqs) or not os.path.exists(map_):
        to_obtain.append((acc, map_, seqs))

if not debug:
    for acc, map_, seqs in to_obtain:
        print "Fetching %s" % acc
        fetch_study(acc, map_, seqs)

if len(mapping_files) == 1:
    mapping_fp = mapping_files[0]
else:
    mapping_fp = get_relative_new_path('combined_mapping.txt')
    merge_args = {'input_a': mapping_files[0], 'input_b': mapping_files[1],
                  'output': mapping_fp}
    res = wait_on(submit(scripts['Merge Mapping Files'] % merge_args))
    for m in mapping_files[2:]:
        merge_args = {'input_a': mapping_fp, 'input_b': m, 'output': mapping_fp}
        res = wait_on(submit(scripts['Merge Mapping Files'] % merge_args))
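# The cluster helpers used throughout (submit, wait_on, farm_commands) are
# provided by cluster_utils.ipy and are specific to the queueing system this
# run used. As a rough, hypothetical sketch of the contract they appear to
# follow (not the actual implementation), a serial, local stand-in might be:
import subprocess

def local_submit(cmd):
    # run the shell command immediately; hand the command string back as the "job"
    subprocess.check_call(cmd, shell=True)
    return cmd

def local_wait_on(jobs, additional_prefix=None):
    # everything already ran synchronously, so just normalize to a list
    return jobs if isinstance(jobs, list) else [jobs]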
# concatenate all sequence files into one merged sequence file; this can take a while!
merged_sequences_full_length_fp = get_relative_new_path('merged_sequences_full_length.fna')
with open(merged_sequences_full_length_fp, 'w') as merged_seqs:
    concatenate_files([open(f, 'U') for f in sequence_files], merged_seqs)
check_file(merged_sequences_full_length_fp)

# pull out just the fecal samples for bloom OTU picking
ag_just_fecal = os.path.splitext(os.path.basename(merged_sequences_full_length_fp))[0] + '-fecal.fna'
filter_args = {'input': merged_sequences_full_length_fp,
               'output': ag_just_fecal,
               'mapping': mapping_fp,
               'states': ':'.join(('BODY_SITE', 'UBERON:feces'))}
wait_on(submit(scripts['Filter Sequences to Fecal'] % filter_args))

# identify the sequences that recruit to the bloom reference
no_ext = os.path.splitext(os.path.basename(ag_just_fecal))[0]
bloom_otus = ag_just_fecal + '-bloom-otus'
bloom_hits = os.path.join(bloom_otus, 'sortmerna_picked_otus', no_ext + '_otus.txt')
bloom_args = {'input': ag_just_fecal,
              'output': bloom_otus,
              'reference': bloom_seqs,
              'taxonomy': reference_taxonomy}
wait_on(submit_smr(scripts['Pick Closed Reference OTUs'] % bloom_args))

# remove the bloom hits from the full set of sequences
bloom_filtered = os.path.splitext(merged_sequences_full_length_fp)[0] + '-debloomed.fna'
filter_args = {'input': merged_sequences_full_length_fp,
               'output': bloom_filtered,
               'otus': bloom_hits}
wait_on(submit(scripts['Filter Sequences'] % filter_args))

# trim the debloomed sequences to the first 100 nucleotides
bloom_filtered_100nt = os.path.splitext(bloom_filtered)[0] + '-100nt.fna'
with open(bloom_filtered, 'U') as merged_seqs, open(bloom_filtered_100nt, 'w') as merged_seqs_trimmed:
    trim_fasta(merged_seqs, merged_seqs_trimmed, 100)
check_file(bloom_filtered_100nt)

# point SortMeRNA at a prebuilt index of the reference set
params_file = get_relative_existing_path('sortmerna_pick_params.txt')
with open(params_file, 'a') as f:
    f.write("pick_otus:sortmerna_db /home/mcdonadt/ResearchWork/gg_13_8_otus/97_otus_idx\n")

full_length_otus = os.path.splitext(bloom_filtered)[0] + '-otus'
full_length_args = {'input': bloom_filtered,
                    'output': full_length_otus,
                    'reference': reference_rep_set,
                    'taxonomy': reference_taxonomy}

trimmed_otus = os.path.splitext(bloom_filtered_100nt)[0] + '-otus'
trimmed_args = {'input': bloom_filtered_100nt,
                'output': trimmed_otus,
                'reference': reference_rep_set,
                'taxonomy': reference_taxonomy}

jobs = [submit_smr(scripts['Pick Closed Reference OTUs'] % full_length_args),
        submit_smr(scripts['Pick Closed Reference OTUs'] % trimmed_args)]
wait_on(jobs)

ag_100nt_m_fp = get_relative_existing_path('combined_mapping.txt')
ag_100nt_t_fp = get_relative_existing_path('merged_sequences_full_length-debloomed-100nt-otus/otu_table.biom')

path_to_participant_data = None
passwd_for_participant_data = None
path_to_previously_printed = None

participants = parse_identifying_data(path_to_participant_data, passwd_for_participant_data)
prev_printed = parse_previously_printed(path_to_previously_printed)

# new file paths
ag_100nt_m_massaged_fp = get_relative_new_path('AG_100nt_massaged.txt')
gg_100nt_m_massaged_fp = get_relative_new_path('GG_100nt_massaged.txt')
pgp_100nt_m_massaged_fp = get_relative_new_path('PGP_100nt_massaged.txt')
hmp_100nt_m_massaged_fp = get_relative_new_path('HMP_100nt_massaged.txt')

# parse the list of PGP barcodes that are in the American Gut
pgp_ids_fp = get_relative_existing_path('pgp_agp_barcodes.txt')
pgp_ids = [l.strip() for l in open(pgp_ids_fp) if not l.startswith('#')]

# massage the mapping files
clean_and_reformat_mapping(open(ag_100nt_m_fp, 'U'), open(ag_100nt_m_massaged_fp, 'w'),
                           'body_site', 'AGP', pgp_ids=pgp_ids)
clean_and_reformat_mapping(open(gg_100nt_m_fp, 'U'), open(gg_100nt_m_massaged_fp, 'w'),
                           'body_site', 'GG')
clean_and_reformat_mapping(open(pgp_100nt_m_fp, 'U'), open(pgp_100nt_m_massaged_fp, 'w'),
                           'body_site', 'PGP',
                           pgp_ids=True)  # every sample in the PGP mapping is flagged as PGP
clean_and_reformat_mapping(open(hmp_100nt_m_fp, 'U'), open(hmp_100nt_m_massaged_fp, 'w'),
                           'bodysite', 'HMP')

# setup output paths (mm -> massaged mapping)
hmp_pgp_mm_fp = get_relative_new_path('HMP_PGP_100nt_massaged.txt')
ag_gg_mm_fp = get_relative_new_path('AG_GG_100nt_massaged.txt')
hmp_pgp_ag_gg_mm_fp = get_relative_new_path('HMP_GG_AG_PGP_100nt_massaged.txt')

hmp_pgp_cmd_args = {'input_a': hmp_100nt_m_massaged_fp,
                    'input_b': pgp_100nt_m_massaged_fp,
                    'output': hmp_pgp_mm_fp}
ag_gg_cmd_args = {'input_a': ag_100nt_m_massaged_fp,
                  'input_b': gg_100nt_m_massaged_fp,
                  'output': ag_gg_mm_fp}
hmp_pgp_ag_gg_cmd_args = {'input_a': hmp_pgp_mm_fp,
                          'input_b': ag_gg_mm_fp,
                          'output': hmp_pgp_ag_gg_mm_fp}

# merge and block until completion
hmp_pgp_job = submit(scripts['Merge Mapping Files'] % hmp_pgp_cmd_args)
ag_gg_job = submit(scripts['Merge Mapping Files'] % ag_gg_cmd_args)
jobs = wait_on([hmp_pgp_job, ag_gg_job])

# merge the combined pair and block until completion
hmp_pgp_ag_gg_job = submit(scripts['Merge Mapping Files'] % hmp_pgp_ag_gg_cmd_args)
jobs = wait_on(hmp_pgp_ag_gg_job)

fig1_m_fp = get_relative_new_path('HMP_GG_AG_PGP_figure1.txt')
fig2_m_fp = get_relative_new_path('AG_GG_fecal_figure2.txt')
fig3_m_fp = get_relative_new_path('AG_fecal_figure3.txt')
fig4_m_fp = get_relative_new_path('AG_fecal_figure4.txt')
oral_m_fp = get_relative_new_path('AG_oral.txt')
skin_m_fp = get_relative_new_path('AG_skin.txt')

# for the first PCoA, keep only these five columns regardless of value
fig1_filter_criteria = {'TITLE_ACRONYM': None,
                        'SIMPLE_BODY_SITE': None,
                        'TITLE_BODY_SITE': None,
                        'HMP_SITE': None,
                        'IS_PGP': None}

# for the second PCoA, keep a similar set of columns, and only the fecal samples
fig2_filter_criteria = {'TITLE_ACRONYM': None,
                        'AGE': None,
                        'SIMPLE_BODY_SITE': lambda x: x == 'FECAL',
                        'COUNTRY': None,
                        'IS_PGP': None}

# for the third PCoA, keep a minimal set of columns and only the fecal samples
fig3_filter_criteria = {'TITLE_ACRONYM': None,
                        'SIMPLE_BODY_SITE': lambda x: x == 'FECAL',
                        'IS_PGP': None}

# for the taxonomy figure, retain the fecal samples plus a few additional categories
fig4_filter_criteria = {'TITLE_ACRONYM': None,
                        'AGE_CATEGORY': None,
                        'SEX': None,
                        'BMI_CATEGORY': None,
                        'DIET_TYPE': None,
                        'SIMPLE_BODY_SITE': lambda x: x == 'FECAL'}
oral_filter_criteria = {'TITLE_ACRONYM': None,
                        'AGE_CATEGORY': None,
                        'SEX': None,
                        'BMI_CATEGORY': None,
                        'DIET_TYPE': None,
                        'SIMPLE_BODY_SITE': lambda x: x == 'ORAL'}
skin_filter_criteria = {'TITLE_ACRONYM': None,
                        'AGE_CATEGORY': None,
                        'SEX': None,
                        'BMI_CATEGORY': None,
                        'DIET_TYPE': None,
                        'SIMPLE_BODY_SITE': lambda x: x == 'SKIN'}

filter_mapping_file(open(hmp_pgp_ag_gg_mm_fp, 'U'), open(fig1_m_fp, 'w'), fig1_filter_criteria)
filter_mapping_file(open(ag_gg_mm_fp, 'U'), open(fig2_m_fp, 'w'), fig2_filter_criteria)
filter_mapping_file(open(ag_100nt_m_massaged_fp, 'U'), open(fig3_m_fp, 'w'), fig3_filter_criteria)
filter_mapping_file(open(ag_100nt_m_massaged_fp, 'U'), open(fig4_m_fp, 'w'), fig4_filter_criteria)
filter_mapping_file(open(ag_100nt_m_massaged_fp, 'U'), open(oral_m_fp, 'w'), oral_filter_criteria)
filter_mapping_file(open(ag_100nt_m_massaged_fp, 'U'), open(skin_m_fp, 'w'), skin_filter_criteria)
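# filter_mapping_file keeps only the columns named in a criteria dict; from
# the way the criteria are constructed above, a value of None appears to mean
# "keep the column, accept any value", while a callable acts as a per-sample
# predicate. A minimal hypothetical criteria dict keeping only fecal samples:
example_criteria = {'TITLE_ACRONYM': None,
                    'SIMPLE_BODY_SITE': lambda x: x == 'FECAL'}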
# resulting paths
hmp_pgp_t_fp = get_relative_new_path("HMP_PGP_100nt.biom")
ag_gg_t_fp = get_relative_new_path("AG_GG_100nt.biom")
hmp_gg_ag_pgp_t_fp = get_relative_new_path("HMP_GG_AG_PGP_100nt.biom")

# setup the command arguments for each call
hmp_pgp_cmd_args = {'input_a': hmp_100nt_t_fp, 'input_b': pgp_100nt_t_fp, 'output': hmp_pgp_t_fp}
ag_gg_cmd_args = {'input_a': ag_100nt_t_fp, 'input_b': gg_100nt_t_fp, 'output': ag_gg_t_fp}
hmp_gg_ag_pgp_cmd_args = {'input_a': ag_gg_t_fp, 'input_b': hmp_pgp_t_fp, 'output': hmp_gg_ag_pgp_t_fp}

# merge and block until completion
hmp_pgp_job = submit(scripts['Merge OTU Tables'] % hmp_pgp_cmd_args)
ag_gg_job = submit(scripts['Merge OTU Tables'] % ag_gg_cmd_args)
jobs = wait_on([hmp_pgp_job, ag_gg_job])

# merge the combined pair and block until completion
hmp_gg_ag_pgp_job = submit(scripts['Merge OTU Tables'] % hmp_gg_ag_pgp_cmd_args)
jobs = wait_on(hmp_gg_ag_pgp_job)

# resulting path
hmp_gg_ag_pgp_t_1k_fp = get_relative_new_path("HMP_GG_AG_PGP_100nt_even1k.biom")

# setup the command arguments
hmp_gg_ag_pgp_t_1k_cmd_args = {'input': hmp_gg_ag_pgp_t_fp,
                               'output': hmp_gg_ag_pgp_t_1k_fp,
                               'depth': '1000'}

# rarefy and block until completion
hmp_gg_ag_pgp_t_1k_job = submit(scripts['Single Rarefaction'] % hmp_gg_ag_pgp_t_1k_cmd_args)
res = wait_on(hmp_gg_ag_pgp_t_1k_job)

# setup output directory
bdiv_dir = lambda x: 'bdiv_' + os.path.basename(x).split('.', 1)[0]
hmp_gg_ag_pgp_1k_unweighted_unifrac_d = get_relative_new_path(bdiv_dir(hmp_gg_ag_pgp_t_1k_fp))

# setup beta diversity arguments
prefix = 'ag2_bdiv_'
hmp_gg_ag_pgp_cmd_args = {'input': hmp_gg_ag_pgp_t_1k_fp,
                          'output': hmp_gg_ag_pgp_1k_unweighted_unifrac_d,
                          'job_prefix': prefix,
                          'num_jobs': 200,
                          'gg97_tree': reference_tree}

# submit and wait
hmp_gg_ag_pgp_bdiv_job = submit_qsub(scripts['Parallel Beta Diversity'] % hmp_gg_ag_pgp_cmd_args,
                                     prj_name, queue='memroute', extra_args='-l pvmem=16gb')
res = wait_on(hmp_gg_ag_pgp_bdiv_job, additional_prefix=prefix)

uw_bdiv_path = get_relative_existing_path(os.path.join(hmp_gg_ag_pgp_1k_unweighted_unifrac_d,
                                                       'unweighted_unifrac_HMP_GG_AG_PGP_100nt_even1k.txt'))
w_bdiv_path = get_relative_existing_path(os.path.join(hmp_gg_ag_pgp_1k_unweighted_unifrac_d,
                                                      'weighted_unifrac_HMP_GG_AG_PGP_100nt_even1k.txt'))
full_bdiv = uw_bdiv_path

fig_path = lambda x: full_bdiv.rsplit('.txt', 1)[0] + '-' + x + '.txt'
fig1_bdiv = fig_path('fig1')
fig2_bdiv = fig_path('fig2')
fig3_bdiv = fig_path('fig3')

fig1_cmd_args = {'input': full_bdiv, 'output': fig1_bdiv, 'sample_ids': fig1_m_fp}
fig2_cmd_args = {'input': full_bdiv, 'output': fig2_bdiv, 'sample_ids': fig2_m_fp}
fig3_cmd_args = {'input': full_bdiv, 'output': fig3_bdiv, 'sample_ids': fig3_m_fp}

jobs = []
jobs.append(submit(scripts['Filter Distance Matrix'] % fig1_cmd_args))
jobs.append(submit(scripts['Filter Distance Matrix'] % fig2_cmd_args))
jobs.append(submit(scripts['Filter Distance Matrix'] % fig3_cmd_args))
res = wait_on(jobs)

pc_path = lambda x: x.rsplit('.txt', 1)[0] + '_pc.txt'

# verify the expected files are present
check_file(fig1_bdiv)
check_file(fig2_bdiv)
check_file(fig3_bdiv)

fig1_pc = pc_path(fig1_bdiv)
fig2_pc = pc_path(fig2_bdiv)
fig3_pc = pc_path(fig3_bdiv)

# setup our arguments
fig1_cmd_args = {'input': fig1_bdiv, 'output': fig1_pc}
fig2_cmd_args = {'input': fig2_bdiv, 'output': fig2_pc}
fig3_cmd_args = {'input': fig3_bdiv, 'output': fig3_pc}

# submit the jobs
jobs = []
jobs.append(submit(scripts['Principal Coordinates'] % fig1_cmd_args))
jobs.append(submit(scripts['Principal Coordinates'] % fig2_cmd_args))
jobs.append(submit(scripts['Principal Coordinates'] % fig3_cmd_args))
job_results = wait_on(jobs)

# quick little helper for naming Emperor output directories
emp_path = lambda x: x.rsplit('.txt', 1)[0] + '-emp'

# verify expected files are present
check_file(fig1_pc)
check_file(fig2_pc)
check_file(fig3_pc)

# setup output paths
fig1_emp = emp_path(fig1_pc)
fig2_emp = emp_path(fig2_pc)
fig3_emp = emp_path(fig3_pc)
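# quick illustration of the path helpers defined above (file names are
# hypothetical): pc_path swaps a .txt suffix for _pc.txt, emp_path for -emp
assert pc_path('foo/bar.txt') == 'foo/bar_pc.txt'
assert emp_path('foo/bar.txt') == 'foo/bar-emp'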
fig3_filter = get_relative_new_path("figure3.biom")
fig3_taxa = get_relative_new_path("figure3_taxa")
fig3_taxa_mapping_fp = os.path.join(fig3_taxa, os.path.splitext(os.path.basename(fig3_m_fp))[0] + '_L2.txt')

# setup arguments
fig3_filter_args = {'input': hmp_gg_ag_pgp_t_1k_fp, 'output': fig3_filter, 'sample_id_fp': fig3_m_fp}
fig3_summarize_args = {'input': fig3_filter, 'output': fig3_taxa, 'level': '2', 'mapping': fig3_m_fp}
fig1_cmd_args = {'input': fig1_pc, 'output': fig1_emp, 'mapping': fig1_m_fp}
fig2_cmd_args = {'input': fig2_pc, 'output': fig2_emp, 'mapping': fig2_m_fp}
fig3_cmd_args = {'input': fig3_pc, 'output': fig3_emp, 'mapping': fig3_taxa_mapping_fp}

# filter the table down to just the AG fecal samples
filter_job = submit(scripts['Filter Samples'] % fig3_filter_args)
res = wait_on(filter_job)

# summarize taxa into the figure 3 mapping file
taxa_job = submit(scripts['Summarize Taxa Mapping'] % fig3_summarize_args)
res = wait_on(taxa_job)
check_file(fig3_taxa_mapping_fp)

jobs = []
jobs.append(submit(scripts['Make Emperor'] % fig1_cmd_args))
jobs.append(submit(scripts['Make Emperor'] % fig2_cmd_args))
jobs.append(submit(scripts['Make Emperor'] % fig3_cmd_args))
res = wait_on(jobs)

emp_index = lambda x: os.path.join(x, 'index.html')

# form the expected paths for Emperor
fig1 = emp_index(fig1_emp)
fig2 = emp_index(fig2_emp)
fig3 = emp_index(fig3_emp)

# verify the expected files are present
check_file(fig1)
check_file(fig2)
check_file(fig3)

FileLink(fig1, result_html_prefix='Figure 1: ')
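# FileLink renders a clickable link to each Emperor visualization in the
# notebook; result_html_prefix is literal text placed in front of the link.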
FileLink(fig2, result_html_prefix='Figure 2: ')

FileLink(fig3, result_html_prefix='Figure 3: ')

DOWNLOAD_DIRECTORY = os.path.expandvars('$HOME')
FIGURE1_EXPECTED_FILENAME = 'Figure_1.tar.gz'
FIGURE2_EXPECTED_FILENAME = 'Figure_2.tar.gz'
FIGURE3_EXPECTED_FILENAME = 'Figure_3.tar.gz'

# helper for composing paths to the downloaded files
source_path = lambda x, y: os.path.join(x, y)

# setup the destination paths
emperor_images = get_relative_new_path('emperor_images_svg')
if not os.path.exists(emperor_images):
    os.mkdir(emperor_images)

# setup the source paths
figure1_src = source_path(DOWNLOAD_DIRECTORY, FIGURE1_EXPECTED_FILENAME)
figure2_src = source_path(DOWNLOAD_DIRECTORY, FIGURE2_EXPECTED_FILENAME)
figure3_src = source_path(DOWNLOAD_DIRECTORY, FIGURE3_EXPECTED_FILENAME)
check_file(figure1_src)
check_file(figure2_src)
check_file(figure3_src)

# unpack the tarballs
jobs = []
jobs.append(submit('tar xzf %s -C %s' % (figure1_src, emperor_images)))
jobs.append(submit('tar xzf %s -C %s' % (figure2_src, emperor_images)))
jobs.append(submit('tar xzf %s -C %s' % (figure3_src, emperor_images)))
res = wait_on(jobs)

all_ag_IDs = set([l.strip().split('\t')[0] for l in open(ag_100nt_m_fp) if not l.startswith('#')])
all_emp_SVGs = os.listdir(emperor_images)

template_files = get_relative_new_path('template_files')
if not os.path.exists(template_files):
    os.mkdir(template_files)

svg_smash_args = {'input': emperor_images, 'output': template_files, 'prefix': None, 'sample_id': None}
commands = construct_svg_smash_commands(all_emp_SVGs, all_ag_IDs, scripts['SVG Smash'], svg_smash_args)
res = farm_commands(commands, 25)

fig4_sex_fp = get_relative_new_path('fig4_sex.biom')
fig4_age_fp = get_relative_new_path('fig4_age_category.biom')
fig4_diet_fp = get_relative_new_path('fig4_diet.biom')
fig4_bmi_fp = get_relative_new_path('fig4_bmi_category.biom')
ag_fecal_t_1k_fp = get_relative_new_path('ag_fecal_even1k.biom')
ag_oral_t_1k_fp = get_relative_new_path('ag_oral_even1k.biom')
ag_skin_t_1k_fp = get_relative_new_path('ag_skin_even1k.biom')
template_files = get_relative_existing_path('template_files')

check_file(hmp_gg_ag_pgp_t_1k_fp)
check_file(fig4_m_fp)
check_file(oral_m_fp)
check_file(skin_m_fp)

filter_fecal_args = {'input': hmp_gg_ag_pgp_t_1k_fp, 'output': ag_fecal_t_1k_fp, 'sample_id_fp': fig4_m_fp}
filter_oral_args = {'input': hmp_gg_ag_pgp_t_1k_fp, 'output': ag_oral_t_1k_fp, 'sample_id_fp': oral_m_fp}
filter_skin_args = {'input': hmp_gg_ag_pgp_t_1k_fp, 'output': ag_skin_t_1k_fp, 'sample_id_fp': skin_m_fp}

otu_by_cat_sex_args = {'otu_table': ag_fecal_t_1k_fp, 'output': fig4_sex_fp, 'mapping': fig4_m_fp, 'category': 'SEX'}
otu_by_cat_age_args = {'otu_table': ag_fecal_t_1k_fp, 'output': fig4_age_fp, 'mapping': fig4_m_fp, 'category': 'AGE_CATEGORY'}
otu_by_cat_diet_args = {'otu_table': ag_fecal_t_1k_fp, 'output': fig4_diet_fp, 'mapping': fig4_m_fp, 'category': 'DIET_TYPE'}
otu_by_cat_bmi_args = {'otu_table': ag_fecal_t_1k_fp, 'output': fig4_bmi_fp, 'mapping': fig4_m_fp, 'category': 'BMI_CATEGORY'}

phyla_plot_args = {'input': ag_fecal_t_1k_fp,
                   'output': template_files,
                   'mapping': fig4_m_fp,
                   'debug': "-d" if debug else "",
                   'categories': 'SEX:%s, AGE_CATEGORY:%s, DIET_TYPE:%s, BMI_CATEGORY:%s' % (fig4_sex_fp, fig4_age_fp, fig4_diet_fp, fig4_bmi_fp)}

# filter the ag table down to the fecal, oral, and skin subsets for fig 4
filter_jobs = []
filter_jobs.append(submit(scripts['Filter Samples'] % filter_fecal_args))
filter_jobs.append(submit(scripts['Filter Samples'] % filter_oral_args))
filter_jobs.append(submit(scripts['Filter Samples'] % filter_skin_args))
res = wait_on(filter_jobs)
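# summarize_otu_by_cat.py (used next) collapses a per-sample OTU table into
# one column per value of a mapping category. A hypothetical miniature of
# that collapse in plain Python, just to show the idea:
def collapse_by_category(counts, sample_to_cat):
    # counts: {sample_id: {otu_id: count}}; sample_to_cat: {sample_id: category value}
    collapsed = defaultdict(lambda: defaultdict(int))
    for sample, otus in counts.items():
        cat = sample_to_cat[sample]
        for otu, n in otus.items():
            collapsed[cat][otu] += n
    return collapsed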
# get the summarized tables
jobs = []
jobs.append(submit(scripts['Summarize OTU by Category'] % otu_by_cat_sex_args))
jobs.append(submit(scripts['Summarize OTU by Category'] % otu_by_cat_age_args))
jobs.append(submit(scripts['Summarize OTU by Category'] % otu_by_cat_diet_args))
jobs.append(submit(scripts['Summarize OTU by Category'] % otu_by_cat_bmi_args))
res = wait_on(jobs)

# farm out the phyla plots
sample_ids = [l.split('\t')[0] for l in open(fig4_m_fp) if not l.startswith('#')]
commands = construct_phyla_plots_cmds(sample_ids, scripts['Make Phyla Plots'], phyla_plot_args)
res = farm_commands(commands, 5)

ag_fecal_t_norare_fp = get_relative_new_path('ag_fecal_norare_filtered.biom')
ag_skin_t_norare_fp = get_relative_new_path('ag_skin_norare_filtered.biom')
ag_oral_t_norare_fp = get_relative_new_path('ag_oral_norare_filtered.biom')
fecal_tax_sum = get_relative_new_path('ag_fecal_norare_taxasum')
skin_tax_sum = get_relative_new_path('ag_skin_norare_taxasum')
oral_tax_sum = get_relative_new_path('ag_oral_norare_taxasum')
template_files = get_relative_existing_path('template_files')

filter_fecal_args = {'input': ag_100nt_t_fp, 'output': ag_fecal_t_norare_fp, 'sample_id_fp': fig4_m_fp}
filter_skin_args = {'input': ag_100nt_t_fp, 'output': ag_skin_t_norare_fp, 'sample_id_fp': skin_m_fp}
filter_oral_args = {'input': ag_100nt_t_fp, 'output': ag_oral_t_norare_fp, 'sample_id_fp': oral_m_fp}

sum_fecal_args = {'input': ag_fecal_t_norare_fp, 'output': fecal_tax_sum, 'level': "2,3,6"}
sum_skin_args = {'input': ag_skin_t_norare_fp, 'output': skin_tax_sum, 'level': "2,3,6"}
sum_oral_args = {'input': ag_oral_t_norare_fp, 'output': oral_tax_sum, 'level': "2,3,6"}

filter_jobs = []
filter_jobs.append(submit(scripts['Filter Samples'] % filter_fecal_args))
filter_jobs.append(submit(scripts['Filter Samples'] % filter_skin_args))
filter_jobs.append(submit(scripts['Filter Samples'] % filter_oral_args))
res = wait_on(filter_jobs)

sum_jobs = []
sum_jobs.append(submit(scripts['Summarize Taxa'] % sum_fecal_args))
sum_jobs.append(submit(scripts['Summarize Taxa'] % sum_skin_args))
sum_jobs.append(submit(scripts['Summarize Taxa'] % sum_oral_args))
res = wait_on(sum_jobs)

# re-submit the fecal taxa summary on its own
sum_fecal_args = {'input': ag_fecal_t_norare_fp, 'output': fecal_tax_sum, 'level': "2,3,6"}
res = wait_on(submit(scripts['Summarize Taxa'] % sum_fecal_args))

template_files = get_relative_existing_path('template_files')
fig5_lvl6 = get_relative_existing_path('ag_fecal_norare_taxasum/ag_fecal_norare_filtered_L6.biom')
sig_args = {'input': fig5_lvl6, 'output': template_files}

from time import sleep

jobs = []
sample_ids = [l.split('\t')[0] for l in open(fig4_m_fp) if not l.startswith('#')]
for id_ in sample_ids:
    args = sig_args.copy()
    args['samples'] = id_
    cmd = scripts['OTU Significance'] % args
    jobs.append(submit(cmd))
    sleep(0.1)
res = wait_on(jobs)

from biom.util import biom_open
with biom_open(fig5_lvl6) as table:
    per_sample_taxa_summaries(table, get_relative_new_path('template_files/Figure_6_%s.txt'))
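# The per-sample loop above spaces submissions out with sleep(0.1) so the
# scheduler is not flooded. The same pattern as a reusable helper (a sketch,
# using the submit/wait_on helpers defined earlier):
def throttled_submit(cmds, delay=0.1):
    # submit each command, pausing briefly between submissions
    handles = []
    for cmd in cmds:
        handles.append(submit(cmd))
        sleep(delay)
    return handles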
static_paths = {'template': template,
                'aglogo': aglogo,
                'fig1_legend': fig1_legend,
                'fig2_legend': fig2_legend,
                'fig2_2ndlegend': fig2_2ndlegend,
                'fig3_legend': fig3_legend,
                'fig4_overlay': fig4_overlay,
                'fig1_ovals': fig1_ovals,
                'fig2_ovals': fig2_ovals,
                'ball_legend': ball_legend,
                'title': title,
                'working_dir': working_dir}

check_file(ag_100nt_m_massaged_fp)
check_file(template)

if not os.path.exists(get_relative_new_path('unidentified')):
    os.mkdir(get_relative_new_path('unidentified'))
if participants is not None and not os.path.exists(get_relative_new_path('identified')):
    os.mkdir(get_relative_new_path('identified'))

base_setup_cmd = 'cd %s; %s'
indiv_cmds, latex_cmds, missing = construct_bootstrap_and_latex_commands(all_ag_IDs, participants,
                                                                         get_relative_existing_path,
                                                                         static_paths, base_setup_cmd,
                                                                         scripts['To PDF'])
_ = farm_commands(indiv_cmds, 25)
_ = farm_commands(latex_cmds, 25)

if participants is not None:
    res = farm_commands(harvest(get_relative_existing_path('identified')), 50)
res = farm_commands(harvest(get_relative_existing_path('unidentified')), 50)

if participants is not None:
    harvest_path = get_relative_existing_path('identified/harvested')
    identified_smash = pdf_smash(harvest_path, 'identified', previously_printed=prev_printed)
    res = farm_commands(identified_smash, 1)

print working_dir

# setup paths
key_taxa = get_relative_existing_path('stacked_plots_key_taxa.txt')
fecal_tax_sum_1k = get_relative_new_path('ag_fecal_1k_taxasum')
oral_tax_sum_1k = get_relative_new_path('ag_oral_1k_taxasum')
skin_tax_sum_1k = get_relative_new_path('ag_skin_1k_taxasum')
fecal_tax_sum_1k_phy = get_relative_new_path('ag_fecal_1k_taxasum/ag_fecal_even1k_L2.txt')
oral_tax_sum_1k_phy = get_relative_new_path('ag_oral_1k_taxasum/ag_oral_even1k_L2.txt')
skin_tax_sum_1k_phy = get_relative_new_path('ag_skin_1k_taxasum/ag_skin_even1k_L2.txt')
uw_unif_dm_fecal = get_relative_new_path('unweighted_unifrac_HMP_GG_AG_PGP_100nt_even1k_fecal.txt')
uw_unif_dm_fecal_hmp = get_relative_new_path('unweighted_unifrac_HMP_even1k_fecal.txt')
uw_unif_dm_fecal_agp = get_relative_new_path('unweighted_unifrac_AGP_even1k_fecal.txt')
w_unif_dm_fecal = get_relative_new_path('weighted_unifrac_HMP_GG_AG_PGP_100nt_even1k_fecal.txt')
w_unif_dm_fecal_hmp = get_relative_new_path('weighted_unifrac_HMP_even1k_fecal.txt')
w_unif_dm_fecal_agp = get_relative_new_path('weighted_unifrac_AGP_even1k_fecal.txt')
qiime_charts_config = get_relative_existing_path('metadata_charts.json')
fig5 = get_relative_new_path('fig5.pdf')

jobs = []

# get summary counts
agp_sample_count = count_samples(open(mapping_fp))
agp_seq_count = count_seqs(open(merged_sequences_full_length_fp))
agp_unique_participants = count_unique_participants(open(mapping_fp))
pgp_sample_count = count_samples(open(ag_100nt_m_massaged_fp), criteria={'IS_PGP': 'Yes'})
pgp_seq_count = count_seqs(open(bloom_filtered_100nt), subset=pgp_ids)
pgp_unique_participants = count_unique_participants(open(ag_100nt_m_massaged_fp), criteria={'IS_PGP': 'Yes'})

# add the counts from the GET2012 sampling, which was not part of the American Gut
pgp_sample_count += 439
pgp_unique_participants += 86
pgp_seq_count += 9509776

# summarize taxa for the stacked taxa plots
sum_fecal_args = {'input': ag_fecal_t_1k_fp, 'output': fecal_tax_sum_1k, 'level': "2"}
sum_oral_args = {'input': ag_oral_t_1k_fp, 'output': oral_tax_sum_1k, 'level': "2"}
sum_skin_args = {'input': ag_skin_t_1k_fp, 'output': skin_tax_sum_1k, 'level': "2"}
jobs.append(submit(scripts['Summarize Taxa'] % sum_fecal_args))
jobs.append(submit(scripts['Summarize Taxa'] % sum_oral_args))
jobs.append(submit(scripts['Summarize Taxa'] % sum_skin_args))
res = wait_on(jobs)
jobs = []

# setup the stacked taxa plots commands; each body site copies the base
# arguments and overrides the site-specific values
agplots_args = {'mapping': ag_100nt_m_massaged_fp,
                'taxa': None,
                'metadata_cat': 'SIMPLE_BODY_SITE',
                'metadata_val': None,
                'output_prefix': None,
                'key_taxa': key_taxa}
agplots_fecal = agplots_args.copy()
agplots_fecal['taxa'] = fecal_tax_sum_1k_phy
agplots_fecal['metadata_val'] = 'FECAL'
agplots_fecal['output_prefix'] = 'ag_plots_fecal_'
agplots_fecal['identified'] = 'fecal_identified.txt'
agplots_oral = agplots_args.copy()
agplots_oral['taxa'] = oral_tax_sum_1k_phy
agplots_oral['metadata_val'] = 'ORAL'
agplots_oral['output_prefix'] = 'ag_plots_oral_'
agplots_oral['identified'] = 'oral_identified.txt'
agplots_skin = agplots_args.copy()
agplots_skin['taxa'] = skin_tax_sum_1k_phy
agplots_skin['metadata_val'] = 'SKIN'
agplots_skin['output_prefix'] = 'ag_plots_skin_'
agplots_skin['identified'] = 'skin_identified.txt'

# setup the initial filtering for the beta diversity HMP/AGP comparison
filter_uwdm_to_fecal = {'input': fig1_bdiv,
                        'mapping': hmp_pgp_ag_gg_mm_fp,
                        'states': 'SIMPLE_BODY_SITE:FECAL',
                        'output': uw_unif_dm_fecal}
filter_wdm_to_fecal = {'input': w_bdiv_path,
                       'mapping': hmp_pgp_ag_gg_mm_fp,
                       'states': 'SIMPLE_BODY_SITE:FECAL',
                       'output': w_unif_dm_fecal}

# submit the stacked taxa plots, the metadata summaries (QIIME Charts), and
# the initial beta diversity filtering commands
jobs.append(submit(scripts['AGPlots'] % agplots_fecal))
jobs.append(submit(scripts['AGPlots'] % agplots_oral))
jobs.append(submit(scripts['AGPlots'] % agplots_skin))
jobs.append(submit(scripts['QIIME Charts'] % {'input': qiime_charts_config}))
jobs.append(submit(scripts['Filter Distance Matrix by Metadata'] % filter_uwdm_to_fecal))
jobs.append(submit(scripts['Filter Distance Matrix by Metadata'] % filter_wdm_to_fecal))
res = wait_on(jobs)
jobs = []

# second round of beta diversity filtering into project-specific results
filter_uwdm_to_fecal_hmp = {'input': uw_unif_dm_fecal, 'mapping': hmp_pgp_ag_gg_mm_fp,
                            'states': 'TITLE_ACRONYM:HMP', 'output': uw_unif_dm_fecal_hmp}
filter_uwdm_to_fecal_agp = {'input': uw_unif_dm_fecal, 'mapping': hmp_pgp_ag_gg_mm_fp,
                            'states': 'TITLE_ACRONYM:AGP', 'output': uw_unif_dm_fecal_agp}
filter_wdm_to_fecal_hmp = {'input': w_unif_dm_fecal, 'mapping': hmp_pgp_ag_gg_mm_fp,
                           'states': 'TITLE_ACRONYM:HMP', 'output': w_unif_dm_fecal_hmp}
filter_wdm_to_fecal_agp = {'input': w_unif_dm_fecal, 'mapping': hmp_pgp_ag_gg_mm_fp,
                           'states': 'TITLE_ACRONYM:AGP', 'output': w_unif_dm_fecal_agp}
jobs.append(submit(scripts['Filter Distance Matrix by Metadata'] % filter_uwdm_to_fecal_hmp))
jobs.append(submit(scripts['Filter Distance Matrix by Metadata'] % filter_uwdm_to_fecal_agp))
jobs.append(submit(scripts['Filter Distance Matrix by Metadata'] % filter_wdm_to_fecal_hmp))
jobs.append(submit(scripts['Filter Distance Matrix by Metadata'] % filter_wdm_to_fecal_agp))
res = wait_on(jobs)

# perform the beta diversity comparison of the HMP and AGP
bdiv_compare_w = {'inputs': ','.join([w_unif_dm_fecal_hmp, w_unif_dm_fecal_agp]),
                  'output': fig5 + '.dtest.pdf',
                  'labels': 'HMP,AGP',
                  'title': "'Beta diversity added by sampled microbial communities'",
                  'ylabel': "'Diversity (weighted UniFrac)'"}
res = wait_on(submit(scripts['Beta Diversity Comparison'] % bdiv_compare_w))
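# Note on the -s/--states syntax used above and in the next round of
# filtering: the format is 'COLUMN:value1,value2', and comma-separated values
# are alternatives, so 'TITLE_ACRONYM:HMP,AGP' keeps a sample when its
# TITLE_ACRONYM is either HMP or AGP.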
pc_path = lambda x: x.rsplit('.txt', 1)[0] + '_pc.txt'
emp_path = lambda x: x.rsplit('.txt', 1)[0] + '-emp'
emp_index = lambda x: os.path.join(x, 'index.html')

# filter beta diversity down to pairwise project comparisons against the AGP
uw_unif_dm_hmp_and_agp = get_relative_new_path('unweighted_unifrac_HMP_AG_100nt_even1k.txt')
uw_unif_dm_pgp_and_agp = get_relative_new_path('unweighted_unifrac_PGP_AG_100nt_even1k.txt')
uw_unif_dm_gg_and_agp = get_relative_new_path('unweighted_unifrac_GG_AG_100nt_even1k.txt')

filter_uwdm_to_hmp_and_agp = {'input': fig1_bdiv, 'mapping': hmp_pgp_ag_gg_mm_fp,
                              'states': 'TITLE_ACRONYM:HMP,AGP', 'output': uw_unif_dm_hmp_and_agp}
filter_uwdm_to_pgp_and_agp = {'input': fig1_bdiv, 'mapping': hmp_pgp_ag_gg_mm_fp,
                              'states': 'TITLE_ACRONYM:AGP,PGP', 'output': uw_unif_dm_pgp_and_agp}

# just show US/Venezuela/Malawi
filter_uwdm_to_gg_and_agp = {'input': fig2_bdiv,
                             'mapping': hmp_pgp_ag_gg_mm_fp,
                             'states': "'COUNTRY:United States of America,Malawi,Venezuela'",
                             'output': uw_unif_dm_gg_and_agp}

jobs = []
jobs.append(submit(scripts['Filter Distance Matrix by Metadata'] % filter_uwdm_to_hmp_and_agp))
jobs.append(submit(scripts['Filter Distance Matrix by Metadata'] % filter_uwdm_to_pgp_and_agp))
jobs.append(submit(scripts['Filter Distance Matrix by Metadata'] % filter_uwdm_to_gg_and_agp))
res = wait_on(jobs)

# compute principal coordinates
uw_unif_pc_hmp_and_agp = pc_path(uw_unif_dm_hmp_and_agp)
uw_unif_pc_pgp_and_agp = pc_path(uw_unif_dm_pgp_and_agp)
uw_unif_pc_gg_and_agp = pc_path(uw_unif_dm_gg_and_agp)

hmp_agp_pc_cmd_args = {'input': uw_unif_dm_hmp_and_agp, 'output': uw_unif_pc_hmp_and_agp}
pgp_agp_pc_cmd_args = {'input': uw_unif_dm_pgp_and_agp, 'output': uw_unif_pc_pgp_and_agp}
gg_agp_pc_cmd_args = {'input': uw_unif_dm_gg_and_agp, 'output': uw_unif_pc_gg_and_agp}

jobs = []
jobs.append(submit(scripts['Principal Coordinates'] % hmp_agp_pc_cmd_args))
jobs.append(submit(scripts['Principal Coordinates'] % pgp_agp_pc_cmd_args))
jobs.append(submit(scripts['Principal Coordinates'] % gg_agp_pc_cmd_args))
res = wait_on(jobs)

# produce the PCoA visualizations
uw_unif_emp_hmp_and_agp = emp_path(uw_unif_pc_hmp_and_agp)
uw_unif_emp_pgp_and_agp = emp_path(uw_unif_pc_pgp_and_agp)
uw_unif_emp_gg_and_agp = emp_path(uw_unif_pc_gg_and_agp)

hmp_agp_emp_cmd_args = {'input': uw_unif_pc_hmp_and_agp, 'output': uw_unif_emp_hmp_and_agp, 'mapping': fig1_m_fp}
pgp_agp_emp_cmd_args = {'input': uw_unif_pc_pgp_and_agp, 'output': uw_unif_emp_pgp_and_agp, 'mapping': fig1_m_fp}
gg_agp_emp_cmd_args = {'input': uw_unif_pc_gg_and_agp, 'output': uw_unif_emp_gg_and_agp, 'mapping': fig2_m_fp}

jobs = []
jobs.append(submit(scripts['Make Emperor'] % hmp_agp_emp_cmd_args))
jobs.append(submit(scripts['Make Emperor'] % pgp_agp_emp_cmd_args))
jobs.append(submit(scripts['Make Emperor'] % gg_agp_emp_cmd_args))
res = wait_on(jobs)

FileLink(emp_index(uw_unif_emp_hmp_and_agp), result_html_prefix='HMP and AGP, please save an SVG as "agp_hmp": ')
FileLink(emp_index(uw_unif_emp_pgp_and_agp), result_html_prefix='PGP and AGP, please save an SVG as "agp_pgp": ')

FileLink(emp_index(uw_unif_emp_gg_and_agp), result_html_prefix='GG and AGP, please save an SVG as "agp_gg_age" and "agp_gg_country": ')

# SVG to PDF
agp_hmp_svg = source_path(DOWNLOAD_DIRECTORY, 'agp_hmp.svg')
agp_pgp_svg = source_path(DOWNLOAD_DIRECTORY, 'agp_pgp.svg')
agp_gg_age_svg = source_path(DOWNLOAD_DIRECTORY, 'agp_gg_age.svg')
agp_gg_country_svg = source_path(DOWNLOAD_DIRECTORY, 'agp_gg_country.svg')
check_file(agp_hmp_svg)
check_file(agp_pgp_svg)
check_file(agp_gg_age_svg)
check_file(agp_gg_country_svg)

agp_hmp_pdf = get_relative_new_path('agp_hmp.pdf')
agp_pgp_pdf = get_relative_new_path('agp_pgp.pdf')
agp_gg_age_pdf = get_relative_new_path('agp_gg_age.pdf')
agp_gg_country_pdf = get_relative_new_path('agp_gg_country.pdf')

agp_hmp_args = {'input': agp_hmp_svg, 'output': agp_hmp_pdf}
agp_pgp_args = {'input': agp_pgp_svg, 'output': agp_pgp_pdf}
agp_gg_age_args = {'input': agp_gg_age_svg, 'output': agp_gg_age_pdf}
agp_gg_country_args = {'input': agp_gg_country_svg, 'output': agp_gg_country_pdf}

jobs = []
jobs.append(submit(scripts['SVG to PDF'] % agp_hmp_args))
jobs.append(submit(scripts['SVG to PDF'] % agp_pgp_args))
jobs.append(submit(scripts['SVG to PDF'] % agp_gg_age_args))
jobs.append(submit(scripts['SVG to PDF'] % agp_gg_country_args))
res = wait_on(jobs)

macros_fp = get_relative_new_path('mod1_macros.tex')

# get the date and format it for human consumption (e.g., January 1, 2014)
cur_date = datetime.datetime.now()
date_fmt = cur_date.strftime("%B %d, %Y")

# format the counts with comma grouping (e.g., 1,234,567)
agp_samples_fmt = locale.format("%d", agp_sample_count, grouping=True)
agp_participants_fmt = locale.format("%d", agp_unique_participants, grouping=True)
agp_sequences_fmt = locale.format("%d", agp_seq_count, grouping=True)
pgp_samples_fmt = locale.format("%d", pgp_sample_count, grouping=True)
pgp_participants_fmt = locale.format("%d", pgp_unique_participants, grouping=True)
pgp_sequences_fmt = locale.format("%d", pgp_seq_count, grouping=True)

# build the macro template for the latex document
macro_template = ["% release date"]
macro_template.append("\\def\\releaseDate{%s}" % date_fmt)
macro_template.append("% participants paragraph")
macro_template.append("\\def\\numSamples{%s}" % agp_samples_fmt)
macro_template.append("\\def\\numParticipants{%s}" % agp_participants_fmt)

# published studies
macro_template.append("% participants table")
macro_template.append("\\def\\hmpAge{Adults}")
macro_template.append("\\def\\hmpLocation{USA}")
macro_template.append("\\def\\hmpSamples{4,788}")
macro_template.append("\\def\\hmpParticipants{242}")
macro_template.append("\\def\\hmpSequences{36,797,226}")
macro_template.append("\\def\\ggAge{Adults,Children}")
macro_template.append("\\def\\ggLocation{Venezuela, Malawi, USA}")
macro_template.append("\\def\\ggSamples{531}")
macro_template.append("\\def\\ggParticipants{531}")
macro_template.append("\\def\\ggSequences{1,093,740,274}")

# growing studies
macro_template.append("\\def\\pgpAge{Adults}")
macro_template.append("\\def\\pgpLocation{USA}")
macro_template.append("\\def\\pgpSamples{%s}" % pgp_samples_fmt)
macro_template.append("\\def\\pgpParticipants{%s}" % pgp_participants_fmt)
macro_template.append("\\def\\pgpSequences{%s}" % pgp_sequences_fmt)
macro_template.append("\\def\\agpAge{Adults, Children}")
macro_template.append("\\def\\agpLocation{Global}")
macro_template.append("\\def\\agpSamples{%s}" % agp_samples_fmt)
macro_template.append("\\def\\agpParticipants{%s}" % agp_participants_fmt)
macro_template.append("\\def\\agpSequences{%s}" % agp_sequences_fmt)
macro_template.append("% diversity figure")
macro_template.append("\\def\\numParticipantsLowerEstimate{%s}" % agp_participants_fmt)
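# each entry renders to a literal TeX \def line; for example, with the en_US
# locale set at the top of this notebook:
example_fmt = locale.format("%d", 1234567, grouping=True)   # '1,234,567'
example_macro = "\\def\\numSamples{%s}" % example_fmt       # '\def\numSamples{1,234,567}'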
with open(macros_fp, 'w') as macros:
    macros.write('\n'.join(macro_template))
    macros.write('\n')

# summary static images
for_mod1 = ['ag_plots_fecal_legend.pdf',
            'ag_plots_fecal_stack.pdf',
            'ag_plots_oral_stack.pdf',
            'ag_plots_skin_stack.pdf',
            'agp_gg_age.pdf',
            'agp_gg_country.pdf',
            'agp_hmp.pdf',
            'agp_pgp.pdf',
            'fig2.pdf',
            'fig3.pdf',
            'fig5.pdf',
            'fig6_a.pdf',
            'fig6_b.pdf',
            'fig6_legend.pdf',
            's1.pdf',
            's2.pdf',
            's3.pdf',
            'logoshape.pdf',
            'legend_age.pdf',
            'legend_agp_gg.pdf',
            'legend_agp_hmp_pgp.pdf']

os.mkdir(get_relative_new_path('pdfs-mod1'))
for f in for_mod1:
    shutil.copy(get_relative_existing_path(f), get_relative_new_path('pdfs-mod1/'))

# compile the module 1 summary document
res = wait_on(submit(scripts['To PDF'] % {'path': working_dir, 'input': 'mod1_main.tex'}))
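# sanity check that the compiled summary exists; mod1_main.pdf is lualatex's
# default output name for mod1_main.tex, assuming the default jobname
check_file(os.path.join(working_dir, 'mod1_main.pdf'))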