#!/usr/bin/env python # coding: utf-8 # # Scripts for mutation and repair rate analysis # # # The following procedure can be used to reproduce the results that are present in the article "Nucleotide excision repair is impaired by binding of transcription factors to DNA". # # Dependencies required to run the following scripts: # * External programs # * [bedtools (version >= 2.17)](http://bedtools.readthedocs.org) # * [bg-qmap](http://bitbucket.org/bgframework/bgqmap) # # * Data set and config file # # * Download all the required data set that are mentioned in [Dataset_and_Preprocessing](Dataset_and_Preprocessing.ipynb) notebook. # * set path information to variables in the config file inside scripts folder # # we highly recommend to run the following commands in your terminal. # ### Summary of different analysis perfomed: # # * [Mutation/repair rate in proximal TFBS](#proximalTFBS) # * [Mutation/repair rate in distal TFBS](#distalTFBS) # * [Mutation/repair rate in TFBS seperated by binding strength](#bindStrength) # * [Mutation rate in a subset of bound and unbound TFBS](#unbound) # * [Mutation/repair rate in TFBS downstream of TSS](#transcribed) # * [Mutation/repair rate in DHS centered regions](#dhsCentered) #

1. Mutation/repair rate in proximal TFBS

# # The results obtained from the following analysis are used to plot Figure 1, Figure 3a, Extended Data Fig. 2 and 4. #

1.1. Compute mutation/repair rate for each TF and combined all TFs togethers

# In[ ]: # generate an input file with list of individual TFs to run for motif in $(zcat dataset/TFBS/proximalTFBS-DHS_skcm.bed.gz | cut -f 4 | sort -u); \ do for atype in DHS noDHS; \ do echo -e "-m $motif -c skcm -t proximal -a $atype"; done;done >tmp/alltfbs_mutRate_proximal.txt # map observed mutations for each TF bg-qmap -c scripts/tfbsMutationRate.sh -m tmp/alltfbs_mutRate_proximal.txt -o tmp/output -n proximal # if you would like to run without bg-qmap (parallel), then try the following # while read line;do scripts/tfbsMutationRate.sh $line;done < tmp/alltfbs_mutRate_proximal.txt # combine results for all TFs together scripts/all_tfbsMutationRate_unique.sh -c skcm -t proximal # collect the results for individual TFs and all TFs together in a metafile for atype in DHS noDHS;do scripts/get_results_tfbsMutationRate.pl -c skcm -t proximal -a $atype;done # compute background mutation rate # first, compute the mutational probabilities for each tri-nucleotide context scripts/getSignature.sh # the above file will generate "signature_probabilities.tsv" in the dataset/mutations/ folder which will be # used for the randomization in the following step. # please note that the following script is time consuming bg-qmap -c scripts/tfbsBackMutationRate.sh -m tmp/alltfbs_mutRate_proximal.txt -o tmp/output -n proximalBack # collect background scores in a metafile. for atype in DHS noDHS; do \ scripts/get_results_tfbsBackgMutationRate.py -c skcm -t tfbs-proximal -a $atype -o metafiles/tfbs-proximal;done # map XR-seq excision repair data for individual TFs bg-qmap -c scripts/map_xrseq_both.sh -m tmp/alltfbs_mutRate_proximal.txt -o tmp/output -n proxRepair # combine results for all TFs together scripts/all_tfbsCentered_xrseq_unique.sh -c skcm -t proximal # collect the results for individual TFs and all TFs together in a metafile scripts/get_results_tfbsXRseq.pl -c skcm -t proximal -a DHS -o metafiles/tfbs-proximal #

1.2. Per sample analysis

# In[ ]: # for normal skin for motif in $(zcat dataset/TFBS/proximalTFBS-DHS_skcm.bed.gz | cut -f 4 | sort -u); \ do for atype in DHS noDHS;do echo -e "-m $motif -c eyelid -t proximal -a $atype";done;done >tmp/eyelid.txt # map mutaions to each TF motifs bg-qmap -c scripts/tfbsMutationRate.sh -m tmp/eyelid.txt -o tmp/output -n eyelid # combine allTFs together scripts/all_tfbsMutationRate_unique.sh -c eyelid -t proximal # In[ ]: # prepare the files ready for the other 38 SKCM samples for atype in DHS noDHS;do scripts/prepare_perSample.sh -c skcm -t proximal -a $atype;done # remap mutation per sample wise and generate a metafile with results. for atype in DHS noDHS;do scripts/persample.sh $atype;done # In[ ]: # perform enrichment analysis for perTF and perSample analysis # create output directory mkdir results/metafiles/enrichmentAnalysis # script to perform enrichment analysis python scripts/enrichment_analysis.py #

1.3. Per mutation type analysis

# In[ ]: # prepare the file scripts/mutationTypes_allSamples.sh -c skcm -t proximal #

2. Mutation/repair rate in distal TFBS

# # The results obtained from the following analysis are using to plot Figure 2a. # In[ ]: # generate an input file with list of individual TFs to run for motif in $(cat dataset/TFBS/distalTFBS-DHS_skcm.bed.gz | cut -f 4 | sort -u); \ do echo -e "-m $motif -c skcm -t distal -a DHS";done >tmp/alltfbs_mutRate_distal.txt # map observed mutations for each TF bg-qmap -c scripts/tfbsMutationRate.sh -m tmp/alltfbs_mutRate_distal.txt -o tmp/output -n distal # combine results for all TFs together scripts/all_tfbsMutationRate_unique.sh -c skcm -t distal # collect results in a metafile scripts/get_results_tfbsMutationRate.pl -c skcm -t distal -a DHS # compute background mutaiton # check "signature_probabilities.tsv" file is available in the dataset/mutations/ folder, if not please run # the scripts/getSignature.sh as mentioned in above section 1.1 # please note that the following script is time consuming bg-qmap -c scripts/tfbsBackMutationRate.sh -m tmp/alltfbs_mutRate_distal.txt -o tmp/output -n distalBack # collect background scores in a metafile. for atype in DHS noDHS; do \ scripts/get_results_tfbsBackgMutationRate.py -c skcm -t tfbs-distal -a DHS -o metafiles/tfbs-distal;done # for distal regions map nucleosome data bg-qmap -c scripts/per_tfbsCentered_nucl.sh -m tmp/alltfbs_mutRate_distal.txt -o tmp/output -n distalNucl # get unique counts for all TFs together scripts/all_tfbsCentered_nucl_unique.sh -c skcm -t distal # copy the final results to plot cp results/tfbs-distal/skcm/nucleosome/allTFs_skcm.csv results/metafiles/tfbs-distal/allTFs_skcm_DHS_nucl.csv # map XR-seq excision repair data for individual TFs bg-qmap -c scripts/map_xrseq_both.sh -m tmp/alltfbs_mutRate_distal.txt -o tmp/output -n distalRepair # combine results for all TFs together scripts/all_tfbsCentered_xrseq_unique.sh -c skcm -t distal # prepare a metafile file scripts/get_results_tfbsXRseq.pl -c skcm -t distal -a DHS -o metafiles/tfbs-distal #

3. Mutation/repair rate in TFBS seperated by binding strength

# # The results obtained from the following analysis are using to plot Figure 3b and Extended Data Fig. 5. # In[ ]: # generate the input file for motif in $(zcat dataset/TFBS/proximalTFBS-DHS_skcm_quartiles.bed.gz | cut -f 4 | sort -u | grep -v motif); \ do echo -e "-m $motif -c skcm -t bindStrength -a DHS";done;done >tmp/bindStrength.txt # map observed mutations for each TF bg-qmap -c scripts/tfbsMutationRate.sh -m tmp/bindStrength.txt -o tmp/output -n bindStrength # combine results for all TFs scripts/all_tfbsMutationRate_unique.sh -c skcm -t bindStrength # collect results in a metafile scripts/get_results_tfbsMutationRate.pl -c skcm -t bindStrength -a DHS # map XR-seq excision repair data for individual TFs bg-qmap -c scripts/map_xrseq_both.sh -m tmp/bindStrength -o tmp/output -n bindStrengthRepair # combine results for all TFs together scripts/all_tfbsCentered_xrseq_unique.sh -c skcm -t bindStrength # prepare a metafile file scripts/get_results_tfbsXRseq.pl -c skcm -t bindStrength -a DHS -o metafiles/tfbs-bs-seperated #

4. Mutation rate in a subset of bound and unbound TFBS

# # The results obtained from the following analysis are using to plot Extended Data Fig 1. # In[ ]: # gernate the input file for motif in $(cat dataset/TFBS/allTFBS_boundDHS_skcm.bed.gz | cut -f 4 | sort -u); \ do for atype in boundDHS boundNoDHS unboundNoDHS unboundNoDHSSel; \ do echo -e "-m $motif -c skcm -t bound-unbound -a $atype";done;done >tmp/bound-unbound.txt # map observed mutations for each TF bg-qmap -c scripts/tfbsMutationRate.sh -m tmp/bound-unbound.txt -o tmp/toutput -n bound # combine results for all TFs together scripts/all_tfbsMutationRate_unique.sh -c skcm -t bound-unbound # get results in a metafile for atype in boundDHS boundNoDHS unboundNoDHS unboundNoDHSSel; \ do scripts/get_results_tfbsMutationRate.pl -c skcm -t bound-unbound -a $atype;done #

5. Mutation/repair rate in TFBS downstream of TSS

# # The results obtained from the following analysis are using to plot Extended Data Fig. 7. # In[ ]: # generate the input file for motif in $(zcat dataset/TFBS/allTFBS_DHS_skcm_templateStrand.bed.gz \ dataset/TFBS/allTFBS_DHS_skcm_nontemplateStrand.bed.gz | cut -f 4 | sort -u ); \ do for strand in nontemplateStrand templateStrand;do \ echo -e "-m $motif -c skcm -t transcribed -a $strand";done;done >/tmp/transcribed.txt # map observed mutations for each TF bg-qmap -c scripts/tfbsMutationRate.sh -m tmp/transcribed.txt -o tmp/output -n transcribed # combine results for all TFs together sh scripts/all_tfbsMutationRate_unique.sh -c skcm -t transcribed # collect results in a metafile for atype in templateStrand nontemplateStrand; \ do scripts/get_results_tfbsMutationRate.pl -c skcm -t transcribed -a $i;done # map XR-seq excision repair data for individual TFs bg-qmap -c scripts/map_xrseq_both.sh -m tmp/transcribed.txt -o tmp/output -n transcribedRepair # make unqiue count for all TFs together sh scripts/all_tfbsCentered_xrseq_unique.sh -c skcm -t transcribed # collect all results for atype in templateStrand nontemplateStrand;do \ perl scripts/get_results_tfbsXRseq.pl -c skcm -t transcribed -a $atype -o metafiles/tfbs-tss-downstream/;done #

6. Mutation/repair rate in DHS centered regions

# # The results obtained from the following analysis is used to plot Figure 2b, Extended Data Fig. 3 and 6. # In[ ]: # generate the list of subset to analyse for atype in DHS_all DHS_Promoter_noTFBS DHS_Promoter_predTFBS DHS_Promoter_predTFBSAll \ DHS_Promoter_TFBS DHS_noPromoter_noTFBS DHS_noPromoter_predTFBS DHS_noPromoter_predTFBSAll DHS_noPromoter_TFBS; \ do echo "-c skcm -a $atype";done >tmp/dhsCentered.txt # run it on parallel for each of them # please note in the below command we mentioned to use 4 cores for each of the analysis. bg-qmap -c scripts/dhsMutationRate.sh -m tmp/dhsCentered.txt -o tmp/output -n dhs --cores=4 # collect results in a metafile scripts/get_results_dhsMutationRate.pl -c skcm # map XR-seq excision repair data for each DHS centered analysis while read line;do echo -e "$line -t dhsCentered -m DHS";done < tmp/dhsCentered.txt >tmp/dhsRepairMap.txt bg-qmap -c scripts/map_xrseq_both.sh -m tmp/dhsRepairMap.txt -o tmp/output -n dhsRepairMap # collect results in a metafile for atype in DHS_all DHS_Promoter_noTFBS DHS_Promoter_predTFBS DHS_Promoter_predTFBSAll \ DHS_Promoter_TFBS DHS_noPromoter_noTFBS DHS_noPromoter_predTFBS DHS_noPromoter_predTFBSAll DHS_noPromoter_TFBS; \ do scripts/get_results_dhsXRseq.pl -c skcm -t dhsCentered -a $atype -o metafiles/dhsCentered;done