#!/usr/bin/env python
# coding: utf-8
# # Scripts for mutation and repair rate analysis
#
#
# The following procedure can be used to reproduce the results that are present in the article "Nucleotide excision repair is impaired by binding of transcription factors to DNA".
#
# Dependencies required to run the following scripts:
# * External programs
# * [bedtools (version >= 2.17)](http://bedtools.readthedocs.org)
# * [bg-qmap](http://bitbucket.org/bgframework/bgqmap)
#
# * Data set and config file
#
# * Download all the required data set that are mentioned in [Dataset_and_Preprocessing](Dataset_and_Preprocessing.ipynb) notebook.
# * set path information to variables in the config file inside scripts folder
#
# we highly recommend to run the following commands in your terminal.
# ### Summary of different analysis perfomed:
#
# * [Mutation/repair rate in proximal TFBS](#proximalTFBS)
# * [Mutation/repair rate in distal TFBS](#distalTFBS)
# * [Mutation/repair rate in TFBS seperated by binding strength](#bindStrength)
# * [Mutation rate in a subset of bound and unbound TFBS](#unbound)
# * [Mutation/repair rate in TFBS downstream of TSS](#transcribed)
# * [Mutation/repair rate in DHS centered regions](#dhsCentered)
# 1. Mutation/repair rate in proximal TFBS
#
# The results obtained from the following analysis are used to plot Figure 1, Figure 3a, Extended Data Fig. 2 and 4.
#
1.1. Compute mutation/repair rate for each TF and combined all TFs togethers
# In[ ]:
# generate an input file with list of individual TFs to run
for motif in $(zcat dataset/TFBS/proximalTFBS-DHS_skcm.bed.gz | cut -f 4 | sort -u); \
do for atype in DHS noDHS; \
do echo -e "-m $motif -c skcm -t proximal -a $atype"; done;done >tmp/alltfbs_mutRate_proximal.txt
# map observed mutations for each TF
bg-qmap -c scripts/tfbsMutationRate.sh -m tmp/alltfbs_mutRate_proximal.txt -o tmp/output -n proximal
# if you would like to run without bg-qmap (parallel), then try the following
# while read line;do scripts/tfbsMutationRate.sh $line;done < tmp/alltfbs_mutRate_proximal.txt
# combine results for all TFs together
scripts/all_tfbsMutationRate_unique.sh -c skcm -t proximal
# collect the results for individual TFs and all TFs together in a metafile
for atype in DHS noDHS;do scripts/get_results_tfbsMutationRate.pl -c skcm -t proximal -a $atype;done
# compute background mutation rate
# first, compute the mutational probabilities for each tri-nucleotide context
scripts/getSignature.sh
# the above file will generate "signature_probabilities.tsv" in the dataset/mutations/ folder which will be
# used for the randomization in the following step.
# please note that the following script is time consuming
bg-qmap -c scripts/tfbsBackMutationRate.sh -m tmp/alltfbs_mutRate_proximal.txt -o tmp/output -n proximalBack
# collect background scores in a metafile.
for atype in DHS noDHS; do \
scripts/get_results_tfbsBackgMutationRate.py -c skcm -t tfbs-proximal -a $atype -o metafiles/tfbs-proximal;done
# map XR-seq excision repair data for individual TFs
bg-qmap -c scripts/map_xrseq_both.sh -m tmp/alltfbs_mutRate_proximal.txt -o tmp/output -n proxRepair
# combine results for all TFs together
scripts/all_tfbsCentered_xrseq_unique.sh -c skcm -t proximal
# collect the results for individual TFs and all TFs together in a metafile
scripts/get_results_tfbsXRseq.pl -c skcm -t proximal -a DHS -o metafiles/tfbs-proximal
# 1.2. Per sample analysis
# In[ ]:
# for normal skin
for motif in $(zcat dataset/TFBS/proximalTFBS-DHS_skcm.bed.gz | cut -f 4 | sort -u); \
do for atype in DHS noDHS;do echo -e "-m $motif -c eyelid -t proximal -a $atype";done;done >tmp/eyelid.txt
# map mutaions to each TF motifs
bg-qmap -c scripts/tfbsMutationRate.sh -m tmp/eyelid.txt -o tmp/output -n eyelid
# combine allTFs together
scripts/all_tfbsMutationRate_unique.sh -c eyelid -t proximal
# In[ ]:
# prepare the files ready for the other 38 SKCM samples
for atype in DHS noDHS;do scripts/prepare_perSample.sh -c skcm -t proximal -a $atype;done
# remap mutation per sample wise and generate a metafile with results.
for atype in DHS noDHS;do scripts/persample.sh $atype;done
# In[ ]:
# perform enrichment analysis for perTF and perSample analysis
# create output directory
mkdir results/metafiles/enrichmentAnalysis
# script to perform enrichment analysis
python scripts/enrichment_analysis.py
# 1.3. Per mutation type analysis
# In[ ]:
# prepare the file
scripts/mutationTypes_allSamples.sh -c skcm -t proximal
# 2. Mutation/repair rate in distal TFBS
#
# The results obtained from the following analysis are using to plot Figure 2a.
# In[ ]:
# generate an input file with list of individual TFs to run
for motif in $(cat dataset/TFBS/distalTFBS-DHS_skcm.bed.gz | cut -f 4 | sort -u); \
do echo -e "-m $motif -c skcm -t distal -a DHS";done >tmp/alltfbs_mutRate_distal.txt
# map observed mutations for each TF
bg-qmap -c scripts/tfbsMutationRate.sh -m tmp/alltfbs_mutRate_distal.txt -o tmp/output -n distal
# combine results for all TFs together
scripts/all_tfbsMutationRate_unique.sh -c skcm -t distal
# collect results in a metafile
scripts/get_results_tfbsMutationRate.pl -c skcm -t distal -a DHS
# compute background mutaiton
# check "signature_probabilities.tsv" file is available in the dataset/mutations/ folder, if not please run
# the scripts/getSignature.sh as mentioned in above section 1.1
# please note that the following script is time consuming
bg-qmap -c scripts/tfbsBackMutationRate.sh -m tmp/alltfbs_mutRate_distal.txt -o tmp/output -n distalBack
# collect background scores in a metafile.
for atype in DHS noDHS; do \
scripts/get_results_tfbsBackgMutationRate.py -c skcm -t tfbs-distal -a DHS -o metafiles/tfbs-distal;done
# for distal regions map nucleosome data
bg-qmap -c scripts/per_tfbsCentered_nucl.sh -m tmp/alltfbs_mutRate_distal.txt -o tmp/output -n distalNucl
# get unique counts for all TFs together
scripts/all_tfbsCentered_nucl_unique.sh -c skcm -t distal
# copy the final results to plot
cp results/tfbs-distal/skcm/nucleosome/allTFs_skcm.csv results/metafiles/tfbs-distal/allTFs_skcm_DHS_nucl.csv
# map XR-seq excision repair data for individual TFs
bg-qmap -c scripts/map_xrseq_both.sh -m tmp/alltfbs_mutRate_distal.txt -o tmp/output -n distalRepair
# combine results for all TFs together
scripts/all_tfbsCentered_xrseq_unique.sh -c skcm -t distal
# prepare a metafile file
scripts/get_results_tfbsXRseq.pl -c skcm -t distal -a DHS -o metafiles/tfbs-distal
# 3. Mutation/repair rate in TFBS seperated by binding strength
#
# The results obtained from the following analysis are using to plot Figure 3b and Extended Data Fig. 5.
# In[ ]:
# generate the input file
for motif in $(zcat dataset/TFBS/proximalTFBS-DHS_skcm_quartiles.bed.gz | cut -f 4 | sort -u | grep -v motif); \
do echo -e "-m $motif -c skcm -t bindStrength -a DHS";done;done >tmp/bindStrength.txt
# map observed mutations for each TF
bg-qmap -c scripts/tfbsMutationRate.sh -m tmp/bindStrength.txt -o tmp/output -n bindStrength
# combine results for all TFs
scripts/all_tfbsMutationRate_unique.sh -c skcm -t bindStrength
# collect results in a metafile
scripts/get_results_tfbsMutationRate.pl -c skcm -t bindStrength -a DHS
# map XR-seq excision repair data for individual TFs
bg-qmap -c scripts/map_xrseq_both.sh -m tmp/bindStrength -o tmp/output -n bindStrengthRepair
# combine results for all TFs together
scripts/all_tfbsCentered_xrseq_unique.sh -c skcm -t bindStrength
# prepare a metafile file
scripts/get_results_tfbsXRseq.pl -c skcm -t bindStrength -a DHS -o metafiles/tfbs-bs-seperated
# 4. Mutation rate in a subset of bound and unbound TFBS
#
# The results obtained from the following analysis are using to plot Extended Data Fig 1.
# In[ ]:
# gernate the input file
for motif in $(cat dataset/TFBS/allTFBS_boundDHS_skcm.bed.gz | cut -f 4 | sort -u); \
do for atype in boundDHS boundNoDHS unboundNoDHS unboundNoDHSSel; \
do echo -e "-m $motif -c skcm -t bound-unbound -a $atype";done;done >tmp/bound-unbound.txt
# map observed mutations for each TF
bg-qmap -c scripts/tfbsMutationRate.sh -m tmp/bound-unbound.txt -o tmp/toutput -n bound
# combine results for all TFs together
scripts/all_tfbsMutationRate_unique.sh -c skcm -t bound-unbound
# get results in a metafile
for atype in boundDHS boundNoDHS unboundNoDHS unboundNoDHSSel; \
do scripts/get_results_tfbsMutationRate.pl -c skcm -t bound-unbound -a $atype;done
# 5. Mutation/repair rate in TFBS downstream of TSS
#
# The results obtained from the following analysis are using to plot Extended Data Fig. 7.
# In[ ]:
# generate the input file
for motif in $(zcat dataset/TFBS/allTFBS_DHS_skcm_templateStrand.bed.gz \
dataset/TFBS/allTFBS_DHS_skcm_nontemplateStrand.bed.gz | cut -f 4 | sort -u ); \
do for strand in nontemplateStrand templateStrand;do \
echo -e "-m $motif -c skcm -t transcribed -a $strand";done;done >/tmp/transcribed.txt
# map observed mutations for each TF
bg-qmap -c scripts/tfbsMutationRate.sh -m tmp/transcribed.txt -o tmp/output -n transcribed
# combine results for all TFs together
sh scripts/all_tfbsMutationRate_unique.sh -c skcm -t transcribed
# collect results in a metafile
for atype in templateStrand nontemplateStrand; \
do scripts/get_results_tfbsMutationRate.pl -c skcm -t transcribed -a $i;done
# map XR-seq excision repair data for individual TFs
bg-qmap -c scripts/map_xrseq_both.sh -m tmp/transcribed.txt -o tmp/output -n transcribedRepair
# make unqiue count for all TFs together
sh scripts/all_tfbsCentered_xrseq_unique.sh -c skcm -t transcribed
# collect all results
for atype in templateStrand nontemplateStrand;do \
perl scripts/get_results_tfbsXRseq.pl -c skcm -t transcribed -a $atype -o metafiles/tfbs-tss-downstream/;done
# 6. Mutation/repair rate in DHS centered regions
#
# The results obtained from the following analysis is used to plot Figure 2b, Extended Data Fig. 3 and 6.
# In[ ]:
# generate the list of subset to analyse
for atype in DHS_all DHS_Promoter_noTFBS DHS_Promoter_predTFBS DHS_Promoter_predTFBSAll \
DHS_Promoter_TFBS DHS_noPromoter_noTFBS DHS_noPromoter_predTFBS DHS_noPromoter_predTFBSAll DHS_noPromoter_TFBS; \
do echo "-c skcm -a $atype";done >tmp/dhsCentered.txt
# run it on parallel for each of them
# please note in the below command we mentioned to use 4 cores for each of the analysis.
bg-qmap -c scripts/dhsMutationRate.sh -m tmp/dhsCentered.txt -o tmp/output -n dhs --cores=4
# collect results in a metafile
scripts/get_results_dhsMutationRate.pl -c skcm
# map XR-seq excision repair data for each DHS centered analysis
while read line;do echo -e "$line -t dhsCentered -m DHS";done < tmp/dhsCentered.txt >tmp/dhsRepairMap.txt
bg-qmap -c scripts/map_xrseq_both.sh -m tmp/dhsRepairMap.txt -o tmp/output -n dhsRepairMap
# collect results in a metafile
for atype in DHS_all DHS_Promoter_noTFBS DHS_Promoter_predTFBS DHS_Promoter_predTFBSAll \
DHS_Promoter_TFBS DHS_noPromoter_noTFBS DHS_noPromoter_predTFBS DHS_noPromoter_predTFBSAll DHS_noPromoter_TFBS; \
do scripts/get_results_dhsXRseq.pl -c skcm -t dhsCentered -a $atype -o metafiles/dhsCentered;done