#!/usr/bin/env python # coding: utf-8 # # Data set and pre-processing # # The following procedure can be used to reproduce the results that are present in the article "Nucleotide excision repair is impaired by binding of transcription factors to DNA". # The following data set are **mandatory** to run the [scripts](Scripts_Execution.ipynb) for mutation and repair rate analysis. And, these files have to be placed inside the ***dataset*** directory # # * [Somatic mutations](#Somatic mutations) # * [Transcription factor binding sites](#TFBS) # * [DNase Hypersensitive sites](#DHS) # * [Other annotations](#others) # * [Excision repair data](#xrseq) #

1. Somatic mutations

# # The somatic mutations of different cancer types used in this analysis were obtained from [Fredriksson et al., 2014](http://www.ncbi.nlm.nih.gov/pubmed/25383969). # In[ ]: get_ipython().run_cell_magic('bash', '', '\n# create a directory for dataset\nmkdir dataset\n\n# create a directory called "mutations" inside dataset directory and place the mutation file\nmkdir dataset/mutations\n\n# the mutation file "mutations.tsv.gz" obtained was already filtered for mutations that overlap with \n# dbSNP (v138) entries. Further, we seperate single nucleotide substitutions with a minimum variant frequency\n# of 0.2 (as recommended by the authors of Fredriksson et al.,) for the following cancer types \n# SKCM, LUAD, LUSC, CRC, BRCA, BLCA and HNSC.\n\nfor ctype in SKCM LUAD LUSC BRCA BLCA HNSC CRC; do lctype=`echo $ctype | tr \'[:upper:]\' \'[:lower:]\'`; \\\n zgrep -w $ctype dataset/mutations/mutations.tsv.gz | awk \'$7>0.2\' | \\\n awk \'BEGIN{OFS="\\t";} {if(length($5)==1 && length($6)==1 && $5!~/-/ && $6!~/-/){print "chr"$3,$4,$4,$5,$6,$1}}\' | \\\n sort -k1,1 -k2,2n | gzip -c >dataset/mutations/${lctype}.txt.gz; done\n \n# compute the number of mutations per sample\nfor i in $(ls dataset/mutations/*.gz);do name=`echo $i | cut -d "/" -f 3 | cut -d "." -f 1`; \\\n zcat $i | cut -f 6 | sort | uniq -c | awk -v var=$name \'BEGIN{OFS="\\t";}{print var,$2,$1}\' | sort -n -k3,3; \\\n done >dataset/mutations/perSample_MutationCount.txt\n') # The above mutation files seperated per cancer type contain 6 columns delimited by tab (similar to BED format): # # # # # # # # #
columndescription
1chromosome name with prefix "chr"
2mutation position (one-based)
3mutation position (one-based)
4Reference allele. A single letter: A, C, G or T (upper case)
5Alternate allele. A single letter: A, C, G or T (upper case)
6Sample identifier
# In[ ]: # example chr1 58553693 58553693 C T XXXX # The somatic mutations from the whole genome of normal human skin sample were obtained from [Martincorena et al., 2015](http://www.ncbi.nlm.nih.gov/pubmed/25999502) # In[ ]: get_ipython().run_cell_magic('bash', '', '\n# the mutation file "PD20399be_wg_caveman_annotated_with_dinucs.5cols" from Marincorena et al., has to be placed\n# inside dataset/mutations\n\n# extract only single nucleotide variants\ngrep -v sampleID dataset/mutations/PD20399be_wg_caveman_annotated_with_dinucs.5cols | \\\n awk \'BEGIN{OFS="\\t";} {if(length($4)==1 && length($5)==1){print "chr"$2,$3,$3,$4,$5;}}\' | gzip >dataset/mutations/eyelid.txt.gz\n\n# number of snv counts\nzcat dataset/mutations/eyelid.txt.gz | wc -l | sed "s/^/skcm\\tnormalSkin\\t/" >>dataset/mutations/perSample_MutationCount.txt\n') #

2. Transcription factor binding sites annotations

# # The genomic coordinates of the transcription factor binding sites used in the analysis can be obtained from http://bg.upf.edu/group/projects/tfbs/. # In[ ]: get_ipython().run_cell_magic('bash', '', '\nmkdir dataset/TFBS\nwget -r -nH --cut-dirs=3 --no-parent --reject="index.html*" -e robots=off http://bg.upf.edu/group/projects/tfbs/ \\\n -P dataset/TFBS\n') # The files are in BED format (hg19 assembly). # # # # # # #
columndescription
1chromosome name with prefix "chr"
2chromosome start position (zero-based)
3chromosome end position (one-based)
4TF_motif_name
#

Data description

# # All these transcription factor binding sites (TFBS), i.e., TF motif match under ChIP-seq peak regions, were obtained from ENCODE. These comprised, # # # The binding sites were classified into active/inactive binding sites based on their overlap with the [DNase Hypersensitive sites](#DHS). Further, we seperate the binding sites, proximal (promoter regions upstream of TSS) and distal (>5kb away from TSS), based on their genomic location. # # # The different sets of genomic coordinates used for mutation rate analysis: # # # # # # # # # #
classfilenamedescription
active TFBS in proximal regions proximalTFBS-DHS_\*.bed.gz TFBS located in the promoter regions and overlap with DHSs
inactive TFBS in proximal regions proximalTFBS-noDHS_\*.bed.gzTFBS located in the promoter regions but do not overlap with DHSs
active TFBS in distal regions distalTFBS-DHS_skcm.bed.gz # TFBS located in the intergenic region with no annotated TSS within 5kb distance on either sides
bound/unbound TFBS in proximal regions allTFBS_bound\*.bed.gz, allTFBS_unbound\*.bed.gz TFBS located in the promoter regions and overlap with TF peak (bound) or not (unbound)
active TFBS in transcribed regions allTFBS_DHS_skcm_templateStrand.bed.gz TFBS in template strand of the gene
active TFBS in transcribed regions allTFBS_DHS_skcm_nontemplateStrand.bed.gz TFBS in non-template strand of the gene
active TFBS in promoter region proximalTFBS-DHS_skcm_quartiles.bed.gz TFBS are startified based on ChIP-seq read coverage as provided by Reijns et al., 2015
#

3. DNase Hypersensitive site (DHS)

# # Download the DHSs identified by Hotspot algorith (narrowPeaks in FDR 1%) from Epigenome roadmap portal. # In[ ]: get_ipython().run_cell_magic('bash', '', '\nmkdir dataset/DHS\nwget http://egg2.wustl.edu/roadmap/data/byFileType/peaks/consolidated/narrowPeak/{E059,E088,E028,E084}-DNase.hotspot.fdr0.01.peaks.bed.gz \\\n -P dataset/DHS \n\n# matching DHS for each cancer types\ndeclare -A arr=(["skcm"]="E059" ["luad"]="E088" ["lusc"]="E088" ["crc"]="E084" ["brca"]="E028")\n\n# save that information in a file\necho -e "#ctype\\troadmapID\\tfile" >dataset/DHS/DHS_list.txt\nfor i in ${!arr[@]};do echo -e "$i\\t${arr[$i]}\\t${arr[$i]}-DNase.hotspot.fdr0.01.peaks.bed.gz";done >>dataset/DHS/DHS_list.txt \n') #

4. Other annotations

# #

a. Promoter and coding regions

# # Download the genomic co-ordinates of promoter and coding sequences that are used in this analysis # In[ ]: get_ipython().run_cell_magic('bash', '', '\nwget http://bg.upf.edu/group/projects/tfbs/otherAnnotations/{promoter2500.bed.gz,cds.regions.gz} -P dataset\n') # As promoters, we considered the sequences upto 2.4kb upstream of TSS of all protein coding genes in GENCODE (v19). Promoter regions overlapping coding sequences (CDS) and UTRs were excluded. #

b. Genome sequence

# # The genome sequence of hg19 version in fasta and the co-ordinates (as used in Khurana et al., 2013) # In[ ]: get_ipython().run_cell_magic('bash', '', '\nmkdir dataset/genome\nwget http://bg.upf.edu/group/projects/tfbs/otherAnnotations/{human_g1k_v37.fasta.gz,hg19.genome} -P dataset/genome\ngzip -d dataset/genome/hg19.genome.gz\n') #

c. Mappability

# #