The following procedure can be used to reproduce the results presented in the article "Nucleotide excision repair is impaired by binding of transcription factors to DNA".
Dependencies required to run the following scripts:
External programs
Data set and config file
We highly recommend running the following commands in your terminal.
1. Mutation/repair rate in proximal TFBS
The results obtained from the following analysis are used to plot Figure 1, Figure 3a, Extended Data Fig. 2 and 4.
# Build the job list: one argument line per (motif, DHS/noDHS) combination.
# Motif names are the unique values in column 4 of the proximal TFBS bed file.
for motif in $(zcat dataset/TFBS/proximalTFBS-DHS_skcm.bed.gz | cut -f 4 | sort -u); do
  for atype in DHS noDHS; do
    echo -e "-m $motif -c skcm -t proximal -a $atype"
  done
done > tmp/alltfbs_mutRate_proximal.txt

# Map observed mutations for each TF (one bg-qmap job per line of the list).
bg-qmap \
  -c scripts/tfbsMutationRate.sh \
  -m tmp/alltfbs_mutRate_proximal.txt \
  -o tmp/output \
  -n proximal
# Sequential alternative if bg-qmap (parallel execution) is not available:
# while read line;do scripts/tfbsMutationRate.sh $line;done < tmp/alltfbs_mutRate_proximal.txt

# Combine the results of all TFs.
scripts/all_tfbsMutationRate_unique.sh -c skcm -t proximal

# Collect per-TF and all-TF results in a metafile, once per region type.
for atype in DHS noDHS; do
  scripts/get_results_tfbsMutationRate.pl -c skcm -t proximal -a "$atype"
done
# Background mutation rate.
# Step 1: compute the mutational probabilities for each tri-nucleotide context.
scripts/getSignature.sh
# This writes "signature_probabilities.tsv" to the dataset/mutations/ folder;
# the randomization in the next step reads that file.

# Step 2: randomization per TF. Note that this step is time consuming.
bg-qmap \
  -c scripts/tfbsBackMutationRate.sh \
  -m tmp/alltfbs_mutRate_proximal.txt \
  -o tmp/output \
  -n proximalBack

# Step 3: collect the background scores in a metafile, once per region type.
for atype in DHS noDHS; do
  scripts/get_results_tfbsBackgMutationRate.py \
    -c skcm -t tfbs-proximal -a "$atype" -o metafiles/tfbs-proximal
done
# Map XR-seq excision repair data for each TF (reuses the proximal job list).
bg-qmap \
  -c scripts/map_xrseq_both.sh \
  -m tmp/alltfbs_mutRate_proximal.txt \
  -o tmp/output \
  -n proxRepair

# Combine the results of all TFs.
scripts/all_tfbsCentered_xrseq_unique.sh -c skcm -t proximal

# Collect per-TF and all-TF results in a metafile.
scripts/get_results_tfbsXRseq.pl \
  -c skcm -t proximal -a DHS -o metafiles/tfbs-proximal
# Normal skin (eyelid): same motif list as the SKCM proximal analysis,
# but the jobs are run with "-c eyelid".
for motif in $(zcat dataset/TFBS/proximalTFBS-DHS_skcm.bed.gz | cut -f 4 | sort -u); do
  for atype in DHS noDHS; do
    echo -e "-m $motif -c eyelid -t proximal -a $atype"
  done
done > tmp/eyelid.txt

# Map mutations to each TF motif.
bg-qmap \
  -c scripts/tfbsMutationRate.sh \
  -m tmp/eyelid.txt \
  -o tmp/output \
  -n eyelid

# Combine the results of all TFs.
scripts/all_tfbsMutationRate_unique.sh -c eyelid -t proximal
# Prepare the input files for the other 38 SKCM samples.
for atype in DHS noDHS; do
  scripts/prepare_perSample.sh -c skcm -t proximal -a "$atype"
done

# Remap mutations sample by sample and generate a metafile with the results.
for atype in DHS noDHS; do
  scripts/persample.sh "$atype"
done

# Enrichment analysis for the per-TF and per-sample results.
# Create the output directory; -p creates missing parent directories and does
# not fail if the directory already exists (e.g. when re-running the procedure).
mkdir -p results/metafiles/enrichmentAnalysis
# Run the enrichment analysis.
python scripts/enrichment_analysis.py

# Prepare the mutation-type file for all samples.
scripts/mutationTypes_allSamples.sh -c skcm -t proximal
2. Mutation/repair rate in distal TFBS
The results obtained from the following analysis are used to plot Figure 2a.
# Build the job list: one argument line per motif (DHS regions only).
# The bed file is gzip-compressed, so it must be read with zcat; plain cat
# would pass compressed bytes to cut and produce garbage motif names.
for motif in $(zcat dataset/TFBS/distalTFBS-DHS_skcm.bed.gz | cut -f 4 | sort -u); do
  echo -e "-m $motif -c skcm -t distal -a DHS"
done > tmp/alltfbs_mutRate_distal.txt

# Map observed mutations for each TF.
bg-qmap -c scripts/tfbsMutationRate.sh -m tmp/alltfbs_mutRate_distal.txt -o tmp/output -n distal

# Combine the results of all TFs.
scripts/all_tfbsMutationRate_unique.sh -c skcm -t distal

# Collect the results in a metafile.
scripts/get_results_tfbsMutationRate.pl -c skcm -t distal -a DHS
# Compute the background mutation rate.
# Check that "signature_probabilities.tsv" is available in the
# dataset/mutations/ folder; if not, run scripts/getSignature.sh first
# (see the background step of section 1).
# Note that the following step is time consuming.
bg-qmap -c scripts/tfbsBackMutationRate.sh -m tmp/alltfbs_mutRate_distal.txt -o tmp/output -n distalBack

# Collect the background scores in a metafile.
# tmp/alltfbs_mutRate_distal.txt only contains "-a DHS" jobs, so the collector
# is run once for DHS instead of being repeated inside a DHS/noDHS loop that
# never used its loop variable.
scripts/get_results_tfbsBackgMutationRate.py -c skcm -t tfbs-distal -a DHS -o metafiles/tfbs-distal
# Nucleosome data for the distal regions, one job per TF.
bg-qmap \
  -c scripts/per_tfbsCentered_nucl.sh \
  -m tmp/alltfbs_mutRate_distal.txt \
  -o tmp/output \
  -n distalNucl

# Unique counts for all TFs together.
scripts/all_tfbsCentered_nucl_unique.sh -c skcm -t distal

# Copy the final nucleosome table next to the other metafiles for plotting.
cp results/tfbs-distal/skcm/nucleosome/allTFs_skcm.csv \
  results/metafiles/tfbs-distal/allTFs_skcm_DHS_nucl.csv

# XR-seq excision repair data for each TF.
bg-qmap \
  -c scripts/map_xrseq_both.sh \
  -m tmp/alltfbs_mutRate_distal.txt \
  -o tmp/output \
  -n distalRepair

# Combine the results of all TFs.
scripts/all_tfbsCentered_xrseq_unique.sh -c skcm -t distal

# Write the XR-seq metafile.
scripts/get_results_tfbsXRseq.pl \
  -c skcm -t distal -a DHS -o metafiles/tfbs-distal
3. Mutation/repair rate in TFBS separated by binding strength
The results obtained from the following analysis are used to plot Figure 3b and Extended Data Fig. 5.
# Build the job list: one argument line per motif in the quartile bed file.
# "grep -v motif" drops every entry containing the string "motif"
# (presumably a header row; confirm against the bed file).
# There is a single "for", hence a single "done"; a second "done" here would
# be a shell syntax error.
for motif in $(zcat dataset/TFBS/proximalTFBS-DHS_skcm_quartiles.bed.gz | cut -f 4 | sort -u | grep -v motif); do
  echo -e "-m $motif -c skcm -t bindStrength -a DHS"
done > tmp/bindStrength.txt

# Map observed mutations for each TF.
bg-qmap -c scripts/tfbsMutationRate.sh -m tmp/bindStrength.txt -o tmp/output -n bindStrength

# Combine the results of all TFs.
scripts/all_tfbsMutationRate_unique.sh -c skcm -t bindStrength

# Collect the results in a metafile.
scripts/get_results_tfbsMutationRate.pl -c skcm -t bindStrength -a DHS
# Map XR-seq excision repair data for each TF.
# The job list is tmp/bindStrength.txt, the same file used for the mutation
# mapping; the ".txt" extension is required.
bg-qmap -c scripts/map_xrseq_both.sh -m tmp/bindStrength.txt -o tmp/output -n bindStrengthRepair

# Combine the results of all TFs.
scripts/all_tfbsCentered_xrseq_unique.sh -c skcm -t bindStrength

# Write the XR-seq metafile.
scripts/get_results_tfbsXRseq.pl -c skcm -t bindStrength -a DHS -o metafiles/tfbs-bs-seperated
4. Mutation rate in a subset of bound and unbound TFBS
The results obtained from the following analysis are used to plot Extended Data Fig 1.
# Generate the job list: one argument line per (motif, subset) combination.
# The bed file is gzip-compressed, so it must be read with zcat; plain cat
# would pass compressed bytes to cut.
for motif in $(zcat dataset/TFBS/allTFBS_boundDHS_skcm.bed.gz | cut -f 4 | sort -u); do
  for atype in boundDHS boundNoDHS unboundNoDHS unboundNoDHSSel; do
    echo -e "-m $motif -c skcm -t bound-unbound -a $atype"
  done
done > tmp/bound-unbound.txt

# Map observed mutations for each TF.
# -o points at tmp/output, the same directory used by every other bg-qmap call.
bg-qmap -c scripts/tfbsMutationRate.sh -m tmp/bound-unbound.txt -o tmp/output -n bound

# Combine the results of all TFs.
scripts/all_tfbsMutationRate_unique.sh -c skcm -t bound-unbound

# Collect the results in a metafile, once per subset.
for atype in boundDHS boundNoDHS unboundNoDHS unboundNoDHSSel; do
  scripts/get_results_tfbsMutationRate.pl -c skcm -t bound-unbound -a "$atype"
done
5. Mutation/repair rate in TFBS downstream of TSS
The results obtained from the following analysis are used to plot Extended Data Fig. 7.
# Generate the job list: one argument line per (motif, strand) combination.
# Motif names are taken from column 4 of both strand-specific bed files.
# The list is written to the project-local tmp/ directory (not the system
# /tmp), because that is where the bg-qmap calls below read it from.
for motif in $(zcat dataset/TFBS/allTFBS_DHS_skcm_templateStrand.bed.gz \
  dataset/TFBS/allTFBS_DHS_skcm_nontemplateStrand.bed.gz | cut -f 4 | sort -u); do
  for strand in nontemplateStrand templateStrand; do
    echo -e "-m $motif -c skcm -t transcribed -a $strand"
  done
done > tmp/transcribed.txt

# Map observed mutations for each TF.
bg-qmap -c scripts/tfbsMutationRate.sh -m tmp/transcribed.txt -o tmp/output -n transcribed

# Combine the results of all TFs.
sh scripts/all_tfbsMutationRate_unique.sh -c skcm -t transcribed

# Collect the results in a metafile, once per strand.
# The loop variable is "atype"; using any other name (e.g. $i) would pass an
# empty value to -a.
for atype in templateStrand nontemplateStrand; do
  scripts/get_results_tfbsMutationRate.pl -c skcm -t transcribed -a "$atype"
done
# XR-seq excision repair data for each TF (reuses tmp/transcribed.txt).
bg-qmap \
  -c scripts/map_xrseq_both.sh \
  -m tmp/transcribed.txt \
  -o tmp/output \
  -n transcribedRepair

# Unique counts for all TFs together.
sh scripts/all_tfbsCentered_xrseq_unique.sh -c skcm -t transcribed

# Collect the results, once per strand.
for atype in templateStrand nontemplateStrand; do
  perl scripts/get_results_tfbsXRseq.pl \
    -c skcm -t transcribed -a "$atype" -o metafiles/tfbs-tss-downstream/
done
6. Mutation/repair rate in DHS centered regions
The results obtained from the following analysis are used to plot Figure 2b, Extended Data Fig. 3 and 6.
# Write one bg-qmap argument line per DHS subset to analyse.
for atype in \
  DHS_all \
  DHS_Promoter_noTFBS \
  DHS_Promoter_predTFBS \
  DHS_Promoter_predTFBSAll \
  DHS_Promoter_TFBS \
  DHS_noPromoter_noTFBS \
  DHS_noPromoter_predTFBS \
  DHS_noPromoter_predTFBSAll \
  DHS_noPromoter_TFBS
do
  echo "-c skcm -a $atype"
done > tmp/dhsCentered.txt

# Run the subsets in parallel; --cores=4 requests 4 cores for each analysis.
bg-qmap \
  -c scripts/dhsMutationRate.sh \
  -m tmp/dhsCentered.txt \
  -o tmp/output \
  -n dhs \
  --cores=4

# Collect the results in a metafile.
scripts/get_results_dhsMutationRate.pl -c skcm
# XR-seq excision repair data for each DHS-centered subset.
# The job list is tmp/dhsCentered.txt with "-t dhsCentered -m DHS" appended
# to every line.
while read line; do
  echo -e "$line -t dhsCentered -m DHS"
done < tmp/dhsCentered.txt > tmp/dhsRepairMap.txt

bg-qmap \
  -c scripts/map_xrseq_both.sh \
  -m tmp/dhsRepairMap.txt \
  -o tmp/output \
  -n dhsRepairMap

# Collect the results in a metafile, once per DHS subset.
for atype in \
  DHS_all \
  DHS_Promoter_noTFBS \
  DHS_Promoter_predTFBS \
  DHS_Promoter_predTFBSAll \
  DHS_Promoter_TFBS \
  DHS_noPromoter_noTFBS \
  DHS_noPromoter_predTFBS \
  DHS_noPromoter_predTFBSAll \
  DHS_noPromoter_TFBS
do
  scripts/get_results_dhsXRseq.pl \
    -c skcm -t dhsCentered -a "$atype" -o metafiles/dhsCentered
done