#!/usr/bin/env python
# coding: utf-8
# # Example of reproducible analysis pipeline
# We are going to reproduce a couple of figures (2A and 8A) of the following paper:
# In[1]:
from IPython.display import IFrame
url = "http://www.ncbi.nlm.nih.gov/pubmed/22219510"
IFrame(url, 800, 400)
# I select this paper because:
# * The article is open access
# * The reference organism is yeast (12Mb)
# * The data are freely available
# * I know it very well...
# ## Required software:
# * [SRA](http://www.ncbi.nlm.nih.gov/Traces/sra/?view=software): The Sequence Read Archive (SRA) stores raw sequencing data from the next generation of sequencing platforms including Roche 454 GS System®, Illumina Genome Analyzer®, Applied Biosystems SOLiD® System, Helicos Heliscope®, Complete Genomics®, and Pacific Biosciences SMRT®. **In the analysis version: 2.3.5-2 was used.**
#
#
# * [Bowtie](http://bowtie-bio.sourceforge.net/index.shtml): Bowtie is an ultrafast, memory-efficient short read aligner. It aligns short DNA sequences (reads) to the human genome at a rate of over 25 million 35-bp reads per hour. Bowtie indexes the genome with a Burrows-Wheeler index to keep its memory footprint small: typically about 2.2 GB for the human genome (2.9 GB for paired-end). **In the analysis version 1.0.1 was used.**
# ## Required Python libraries:
# This is extracted from the command `pip freeze`. On the right of each library there is the version I have been using in the analysis:
# * ipython==3.1.0
# * numpy==1.9.2
# * matplotlib==1.4.3
# * pandas==0.16.1
# * Cython==0.22
# * weblogo==3.4
# * scipy==0.15.1
# ## Import modules
# In[2]:
get_ipython().run_line_magic('pylab', 'inline')
import pandas as pd
import tarfile
import urllib
import os
import weblogolib
import random
import seaborn as sns
# ## Define the paths of the required software
# In[3]:
SRA = "bin/fastq-dump"
BOWTIE = "bin"
# # PIPELINE
# ## Download the data
# In[4]:
# Download one of the set of raw reads of the paper
sra_url = "ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByStudy/sra/" + \
"SRP%2FSRP009%2FSRP009462/SRR384975/SRR384975.sra"
sra_file = "reads/SRR384975.sra"
# Download the yeast genomic sequences. The release used in this analysis is: R64-1-1
yeast_url = "http://downloads.yeastgenome.org/sequence/S288C_reference/" + \
"genome_releases/S288C_reference_genome_Current_Release.tgz"
yeast_file = "data/yeast_genome.tgz"
genome_file = "data/yeast_genome.fa"
# Download the yeast genomic annotations
features_url = "http://downloads.yeastgenome.org/curation/chromosomal_feature/SGD_features.tab"
features_file = "data/yeast_features.tsv"
# In[5]:
# Create the directories "reads" and "data"
folders = ["data", "reads", "index", "sequences", "figures"]
_ = [os.mkdir(folder) for folder in folders if not os.path.isdir(folder)]
# In[6]:
get_ipython().run_cell_magic('time', '', '\n# Download the files and save it into the local directory\nfor (get_url, get_file) in zip([sra_url, yeast_url, features_url], \n [sra_file, yeast_file, features_file]):\n if not os.path.isfile(get_file):\n urllib.request.urlretrieve(get_url, get_file)\n')
# In[7]:
get_ipython().system('ls')
# ## Extract the SRA file
# In[8]:
get_ipython().system('time $SRA $sra_file --outdir reads')
# ## Extract the sequence of the yeast genome
# In[9]:
tar = tarfile.open(yeast_file)
tar.getnames()
# The right file to extract is **S288C_reference_sequence_R64-2-1_20150113.fsa**
# In[10]:
# Extract the file
tar.extract('S288C_reference_genome_R64-2-1_20150113/S288C_reference_sequence_R64-2-1_20150113.fsa',
'data')
# In[11]:
get_ipython().system('ls data/')
# In[12]:
# Rename the reference_sequence file
get_ipython().system('mv data/S288C_reference_genome_R64-2-1_20150113/S288C_reference_sequence_R64-2-1_20150113.fsa $genome_file')
get_ipython().system('rm -r data/S288C_reference_genome_R64-2-1_20150113/')
#!rm $yeast_file
# In[13]:
get_ipython().system('ls data/')
# ## Build a bowtie index
# In[14]:
index_name = "index/yeast_genome"
get_ipython().system('$BOWTIE/bowtie-build -q $genome_file $index_name')
# ## Trim and align the reads to the yeast genome
# The reads should start with the `Ty1 LTR` (Long Terminal Repeat), so I check for its presence. I also get rid of reads with _undefined (N)_ nucleotides. In the alignment step, in order to align only the genomic sequence downstream the insertion site, I trim the first 10bp.
# In[15]:
get_ipython().run_cell_magic('time', '', '\n# Reads should start with the Ty1 LTR ("TATT"). \n# I also discard those reads that have ambiguous nucleotides.\nfastq_file = os.path.splitext(sra_file)[0] + ".fastq"\ntrimmed_file = os.path.splitext(sra_file)[0] + ".trimmed"\n\nLTR = "TATT"\nMAX_MISMATCHES = 0\n\n\nclass ReadFastq:\n def load(self, line):\n self.lines = [line]\n self.output = ""\n \n def add_line(self, line):\n self.lines.append(line) \n \n def get_output(self):\n if self.lines[1].startswith(LTR) and self.lines[1].count("N") <= MAX_MISMATCHES:\n self.output = "".join(self.lines)\n return self.output\n else:\n return False\n \n\nwith open(fastq_file, \'r\') as fi, open(trimmed_file, \'w\') as fo:\n parser = ReadFastq()\n for n, line in enumerate(fi):\n if n % 4 == 0:\n if n != 0 and parser.get_output():\n fo.write(parser.get_output())\n parser.load(line)\n else:\n parser.add_line(line)\n')
# In[16]:
# I align the reads to the yeast genome with Bowtie
# I trim the first 10bp of the reads and then I require
# unique (-m 1) and perfect (-v 0) alignments.
mapped_file = os.path.splitext(sra_file)[0] + ".mapped"
get_ipython().system('$BOWTIE/bowtie -t $index_name -m 1 -v 0 -5 10 -p 2 -q $trimmed_file $mapped_file')
# ## Get rid of potential clonal reads
# In[17]:
unique_file = mapped_file + ".unique"
get_ipython().run_line_magic('timeit', '-n 1 -r 1 !cut -f2-5 $mapped_file | sort -u > $unique_file')
# In[18]:
get_ipython().system('head $unique_file')
# ## Extract the regions upstream and downstream the Ty1 insertion sites
# In[19]:
# Load the yeast genome in memory
dict_genome = {}
sequence = ""
with open(genome_file, 'r') as fi:
for line in fi:
if line.startswith(">"):
if sequence != "":
dict_genome[head] = sequence.upper()
head = line.split()[0][1:]
sequence = ""
continue
sequence += line.strip()
if sequence != "":
dict_genome[head] = sequence.upper()
# In[20]:
# Define a class to read the alignment file
class ParseAlignments:
def read(self, line):
self.line = line.strip().split("\t")
self.strand = self.line[0]
self.chromosome = self.line[1]
self.position = int(self.line[2])
# In[21]:
# Define a function to extract the genomic sequences
# upstream and downstream a given position.
def reverse_complement(sequence):
d = {"A":"T", "T":"A", "C":"G", "G":"C", "N":"N"}
sequence = sequence[::-1]
return "".join([d[i] for i in sequence])
def get_sequences(input_file, output_file, window=10):
with open(input_file, 'r') as fi, open(output_file, 'w') as fo:
parser = ParseAlignments()
for n, line in enumerate(fi):
parser.read(line)
if parser.strand == "+":
position = parser.position + 3
else:
position = parser.position - 3
sequence = dict_genome[parser.chromosome][position - window: position + window]
if parser.strand == "-":
sequence = reverse_complement(sequence)
fo.write(">sequence_{}\n{}\n".format(n, sequence))
# In[22]:
# Extract sequences of 10bp and 1Kb
get_sequences(input_file=unique_file, output_file="sequences/fasta_10bp.fa")
get_sequences(input_file=unique_file, output_file="sequences/fasta_1kb.fa", window=1000)
# ## Seqlogos
# First we will check the presence of the insertion site pattern. We will do a seqlogo of the flanking regions of the Ty1 insertion sites.
# In[23]:
def do_seqlogo(input_fasta, output_seqlogo):
fin = open(input_fasta)
seqs = weblogolib.read_seq_data(fin)
data = weblogolib.LogoData.from_seqs(seqs)
options = weblogolib.LogoOptions()
options.yaxis_scale = 0.2
options.yaxis_tic_interval = 0.2
format_logo = weblogolib.LogoFormat(data, options)
fout = weblogolib.eps_formatter(data, format_logo)
with open(output_seqlogo, 'wb') as fo:
fo.write(fout)
# In[24]:
# Create the seqlogos
do_seqlogo("sequences/fasta_10bp.fa", "figures/fasta_10bp.eps")
#
# To verify that the above seqlogo is meaninful, we can compare it with a seqlogo obtained with 10000 random sequences.
# In[25]:
# Select 10000 random position in the yeast genome
N = 10000
genome_length = sum([len(i) for i in dict_genome.values()])
sample = random.sample(range(1, genome_length + 1), N)
# This function map a random number to a chromosome and position in the yeast genome
def get_coordinates(number):
for chromosome, sequence in dict_genome.items():
if (number - len(sequence)) > 0:
number -= len(sequence)
# Do not consider position that are too close to the start/end of the chromosome
elif number < 10 or number > len(sequence) - 10:
continue
else:
return (chromosome, number)
# Open the output file
random_file = "reads/random.mapped"
with open(random_file, "w") as fo:
for n in sample:
chromosome, position = get_coordinates(n)
fo.write("+\t{}\t{}\n".format(chromosome, position))
# In[26]:
# Extract sequences of 10bp and 1Kb
get_sequences(input_file=random_file, output_file="sequences/random_10bp.fa")
# Create the seqlogos
do_seqlogo("sequences/random_10bp.fa", "figures/random_10bp.eps")
#
# ## Ty1 insertions around tRNAs
# Second, we will verify that the Ty1 retrotransposons have a strong preference for inserting upstream tRNA genes.
# First of all lets look at the structure of the features file.
# In[27]:
# Create a dataframe with the yeast features
df = pd.read_csv(features_file, sep="\t", header=None)
# In[28]:
df.head(3)
# The **feature** column is the second. Lets look at which features are in the file
# In[29]:
np.unique(df[1])
# We will extract the the rows that contain the **tRNA** feature
# In[30]:
df_trna = df[df[1] == "tRNA"].copy()
trna = pd.DataFrame({"Chromosome": df_trna[8],
"Start": df_trna[9],
"End": df_trna[10],
"Strand": df_trna[11]})
# The chromosomes in the features file are named from 1 to 17 but the mapped file have different names.
# Here we create a dictionary that convert one format into the other.
# In[31]:
dict_chromosomes = {"NC_001133": "1", "NC_001134": "2", "NC_001135": "3",
"NC_001136": "4", "NC_001137": "5", "NC_001138": "6",
"NC_001139": "7", "NC_001140": "8", "NC_001141": "9",
"NC_001142": "10", "NC_001143": "11", "NC_001144": "12",
"NC_001145": "13", "NC_001146": "14", "NC_001147": "15",
"NC_001148": "16", "NC_001224": "17"}
# Lets look at the first lines of the dataframe:
# In[32]:
trna.head()
# The strand column uses **W (Whatson)** and **C (Crick)** to indicate the positive and the negative strands.
# Lets use **+** and **-** instead.
# In[33]:
trna['Strand'] = trna['Strand'].apply(lambda x: "+" if x == "W" else "-")
# In[34]:
# Look at the first lines of the dataframe
trna.head()
# Now we create a dataframe with the **Ty1** insertion positions.
# In[35]:
insertions = []
with open(unique_file, 'r') as fi:
parser = ParseAlignments()
for line in fi:
parser.read(line)
chromosome = dict_chromosomes[parser.chromosome.split("|")[1]]
insertions.append([chromosome, parser.position, parser.strand])
# We create a dataframe with the Ty1 insertion positions.
insertions = pd.DataFrame(insertions, columns=["Chromosome", "Position", "Strand"])
# In[36]:
# Look at the first lines of the dataframe
insertions.head()
# We calculate the distances of the insertions 1Kb upstream and downstream tRNA genes.
# In[37]:
def calculate_distances(feature, insertions, window=1000):
feature_grouped = feature.groupby(["Chromosome"])
insertions_grouped = insertions.groupby(["Chromosome"])
insertions_chromosomes = np.unique(insertions['Chromosome'])
distances_from_feature = []
for chromosome, feature_group in feature_grouped:
if chromosome not in insertions_chromosomes:
continue
insertions_group = insertions_grouped.get_group(chromosome)
for n, line in feature_group.iterrows():
start, end, strand = line[1:]
insertions_around = insertions_group[(insertions_group['Position'] >= start - window) &
(insertions_group['Position'] <= start + window)]
if len(insertions_around) == 0:
continue
# Calculate the distance of the insertions from the feature
distances = insertions_around['Position'].apply(lambda x: x - start)
# Correct for the strand of the feature
distances = distances.apply(lambda x: x * -1 if strand == "-" else x)
distances_from_feature.extend(distances)
return distances_from_feature
# In[38]:
distances = calculate_distances(trna, insertions)
# Represent with a histogram the distribution of the Ty1 insertions aroung tRNA TSS
# In[39]:
sns.set_context('poster')
def plot_distances(distances):
fig = sns.distplot(distances)
fig.set_xlim((-1000, 1000))
fig.set_xlabel("Distance from tRNA TSS")
fig.plot([0, 0], [0, fig.axis()[3]], 'k--', lw=2, alpha=0.5)
# In[40]:
plot_distances(distances)
# We got a peak of insertions just upstream the tRNA TSS, as expected. This is the position where
# the nucleosome is located.
# Also notice that there are other two smaller peaks that overlap with the positions of other nucleosomes.
# Now we can check if this happens with every TSS. So first we extract the rows of the features file
# that contain the **ORF** feature and are **not marked as Dubious**.
# In[41]:
df_orf = df[(df[1] == "ORF") & (df[2] != "Dubious")]
orf = pd.DataFrame({"Chromosome": df_orf[8],
"Start": df_orf[9],
"End": df_orf[10],
"Strand": df_orf[11]})
orf['Strand'] = orf['Strand'].apply(lambda x: "+" if x == "W" else "-")
# In[42]:
# Look at the first lines of the dataframe
orf.head()
# Calculate the distances of the insertions from the ORF TSS
# In[43]:
distances = calculate_distances(orf, insertions)
# Represent with a histogram the distribution of the Ty1 insertions aroung ORF TSS.
# In[44]:
plot_distances(distances)