#!/usr/bin/env python # coding: utf-8 # # Data set and pre-processing # # The following procedure can be used to reproduce the results that are present in the article "Nucleotide excision repair is impaired by binding of transcription factors to DNA". # The following data set are **mandatory** to run the [scripts](Scripts_Execution.ipynb) for mutation and repair rate analysis. And, these files have to be placed inside the ***dataset*** directory # # * [Somatic mutations](#Somatic mutations) # * [Transcription factor binding sites](#TFBS) # * [DNase Hypersensitive sites](#DHS) # * [Other annotations](#others) # * [Excision repair data](#xrseq) #

1. Somatic mutations

# # The somatic mutations of different cancer types used in this analysis were obtained from [Fredriksson et al., 2014](http://www.ncbi.nlm.nih.gov/pubmed/25383969). # In[ ]: get_ipython().run_cell_magic('bash', '', '\n# create a directory for dataset\nmkdir dataset\n\n# create a directory called "mutations" inside dataset directory and place the mutation file\nmkdir dataset/mutations\n\n# the mutation file "mutations.tsv.gz" obtained was already filtered for mutations that overlap with \n# dbSNP (v138) entries. Further, we seperate single nucleotide substitutions with a minimum variant frequency\n# of 0.2 (as recommended by the authors of Fredriksson et al.,) for the following cancer types \n# SKCM, LUAD, LUSC, CRC, BRCA, BLCA and HNSC.\n\nfor ctype in SKCM LUAD LUSC BRCA BLCA HNSC CRC; do lctype=`echo $ctype | tr \'[:upper:]\' \'[:lower:]\'`; \\\n zgrep -w $ctype dataset/mutations/mutations.tsv.gz | awk \'$7>0.2\' | \\\n awk \'BEGIN{OFS="\\t";} {if(length($5)==1 && length($6)==1 && $5!~/-/ && $6!~/-/){print "chr"$3,$4,$4,$5,$6,$1}}\' | \\\n sort -k1,1 -k2,2n | gzip -c >dataset/mutations/${lctype}.txt.gz; done\n \n# compute the number of mutations per sample\nfor i in $(ls dataset/mutations/*.gz);do name=`echo $i | cut -d "/" -f 3 | cut -d "." -f 1`; \\\n zcat $i | cut -f 6 | sort | uniq -c | awk -v var=$name \'BEGIN{OFS="\\t";}{print var,$2,$1}\' | sort -n -k3,3; \\\n done >dataset/mutations/perSample_MutationCount.txt\n') # The above mutation files seperated per cancer type contain 6 columns delimited by tab (similar to BED format): # # # # # # # # #

column	description
1	chromosome name with prefix "chr"
2	mutation position (one-based)
3	mutation position (one-based)
4	Reference allele. A single letter: A, C, G or T (upper case)
5	Alternate allele. A single letter: A, C, G or T (upper case)
6	Sample identifier

# In[ ]: # example chr1 58553693 58553693 C T XXXX # The somatic mutations from the whole genome of normal human skin sample were obtained from [Martincorena et al., 2015](http://www.ncbi.nlm.nih.gov/pubmed/25999502) # In[ ]: get_ipython().run_cell_magic('bash', '', '\n# the mutation file "PD20399be_wg_caveman_annotated_with_dinucs.5cols" from Marincorena et al., has to be placed\n# inside dataset/mutations\n\n# extract only single nucleotide variants\ngrep -v sampleID dataset/mutations/PD20399be_wg_caveman_annotated_with_dinucs.5cols | \\\n awk \'BEGIN{OFS="\\t";} {if(length($4)==1 && length($5)==1){print "chr"$2,$3,$3,$4,$5;}}\' | gzip >dataset/mutations/eyelid.txt.gz\n\n# number of snv counts\nzcat dataset/mutations/eyelid.txt.gz | wc -l | sed "s/^/skcm\\tnormalSkin\\t/" >>dataset/mutations/perSample_MutationCount.txt\n') #

2. Transcription factor binding sites annotations

# # The genomic coordinates of the transcription factor binding sites used in the analysis can be obtained from http://bg.upf.edu/group/projects/tfbs/. # In[ ]: get_ipython().run_cell_magic('bash', '', '\nmkdir dataset/TFBS\nwget -r -nH --cut-dirs=3 --no-parent --reject="index.html*" -e robots=off http://bg.upf.edu/group/projects/tfbs/ \\\n -P dataset/TFBS\n') # The files are in BED format (hg19 assembly). # # # # # # #

column	description
1	chromosome name with prefix "chr"
2	chromosome start position (zero-based)
3	chromosome end position (one-based)
4	TF_motif_name

Data description

# # All these transcription factor binding sites (TFBS), i.e., TF motif match under ChIP-seq peak regions, were obtained from ENCODE. These comprised, #

binding sites of 109 transcription factors (TF) as used in Khurana et al., 2013.
ENCODE predicted binding sites of 52 transcription factors which are not supported by ChIP- # seq peaks obtained from ftp portal
In addition, we obtained the binding sites of 32 TFs used in Reijns et al., 2015

# # The binding sites were classified into active/inactive binding sites based on their overlap with the [DNase Hypersensitive sites](#DHS). Further, we seperate the binding sites, proximal (promoter regions upstream of TSS) and distal (>5kb away from TSS), based on their genomic location. # # # The different sets of genomic coordinates used for mutation rate analysis: # # # # # # # # # #

class	filename	description
active TFBS in proximal regions	proximalTFBS-DHS_\*.bed.gz	TFBS located in the promoter regions and overlap with DHSs
inactive TFBS in proximal regions	proximalTFBS-noDHS_\*.bed.gz	TFBS located in the promoter regions but do not overlap with DHSs
active TFBS in distal regions	distalTFBS-DHS_skcm.bed.gz #	TFBS located in the intergenic region with no annotated TSS within 5kb distance on either sides
bound/unbound TFBS in proximal regions	allTFBS_bound\.bed.gz, allTFBS_unbound\.bed.gz	TFBS located in the promoter regions and overlap with TF peak (bound) or not (unbound)
active TFBS in transcribed regions	allTFBS_DHS_skcm_templateStrand.bed.gz	TFBS in template strand of the gene
active TFBS in transcribed regions	allTFBS_DHS_skcm_nontemplateStrand.bed.gz	TFBS in non-template strand of the gene
active TFBS in promoter region	proximalTFBS-DHS_skcm_quartiles.bed.gz	TFBS are startified based on ChIP-seq read coverage as provided by Reijns et al., 2015

3. DNase Hypersensitive site (DHS)

# # Download the DHSs identified by Hotspot algorith (narrowPeaks in FDR 1%) from Epigenome roadmap portal. # In[ ]: get_ipython().run_cell_magic('bash', '', '\nmkdir dataset/DHS\nwget http://egg2.wustl.edu/roadmap/data/byFileType/peaks/consolidated/narrowPeak/{E059,E088,E028,E084}-DNase.hotspot.fdr0.01.peaks.bed.gz \\\n -P dataset/DHS \n\n# matching DHS for each cancer types\ndeclare -A arr=(["skcm"]="E059" ["luad"]="E088" ["lusc"]="E088" ["crc"]="E084" ["brca"]="E028")\n\n# save that information in a file\necho -e "#ctype\\troadmapID\\tfile" >dataset/DHS/DHS_list.txt\nfor i in ${!arr[@]};do echo -e "$i\\t${arr[$i]}\\t${arr[$i]}-DNase.hotspot.fdr0.01.peaks.bed.gz";done >>dataset/DHS/DHS_list.txt \n') #

4. Other annotations

# #

a. Promoter and coding regions

# # Download the genomic co-ordinates of promoter and coding sequences that are used in this analysis # In[ ]: get_ipython().run_cell_magic('bash', '', '\nwget http://bg.upf.edu/group/projects/tfbs/otherAnnotations/{promoter2500.bed.gz,cds.regions.gz} -P dataset\n') # As promoters, we considered the sequences upto 2.4kb upstream of TSS of all protein coding genes in GENCODE (v19). Promoter regions overlapping coding sequences (CDS) and UTRs were excluded. #

b. Genome sequence

# # The genome sequence of hg19 version in fasta and the co-ordinates (as used in Khurana et al., 2013) # In[ ]: get_ipython().run_cell_magic('bash', '', '\nmkdir dataset/genome\nwget http://bg.upf.edu/group/projects/tfbs/otherAnnotations/{human_g1k_v37.fasta.gz,hg19.genome} -P dataset/genome\ngzip -d dataset/genome/hg19.genome.gz\n') #

c. Mappability

# #

UCSC Browser blacklisted regions (Duke and DAC, http://hgdownload.cse.ucsc.edu/goldenpath/hg19/encodeDCC/wgEncodeMapability/)
Unique mappability of sequencing reads ("CRG Alignability 36' Track", score == 1) (http://genome.ucsc.edu/cgi-bin/hgFileUi?db=hg19&g=wgEncodeMapability)