Import some functions from the python standard library, numpy, and ipython. This will set up our computing environment.
# import some functions from python's random module - these will
# be used in the modeling process
from random import choice, random
# import some math functions from the numpy library (note that this
# isn't part of the python standard library)
from numpy import log10, average
# import FileLink from IPython - this will be used for downloading
# our results
from IPython.display import FileLink
First, let's just experiment with the random
function to learn about what it does.
random()
Define a count_differences
function, which counts the number of differences between a pair of sequences of the same length.
def count_differences(sequence1,sequence2):
"""Count the number of differences between two sequences of the same length
"""
# confirm that the two sequences are the same length and throw
# an error if they aren't
assert len(sequence1) == len(sequence2), "Sequences differ in length"
# initiate a counter for the number of differences
result = 0
# iterate over the two sequences and count the number of
# positions which are not identical
for base1,base2 in zip(sequence1,sequence2):
if base1 != base2:
# this is a commonly used shortcut for incrementing a count
# and is equivalent to the following statement
# result = result + 1
result += 1
return result
Define an evolve_seq
function. This takes a parent sequence, and returns two child sequences to simulate a possible speciation event, where each input sequence (representing a parent's genetic sequence) has diverged into two child sequences (representing the new species, and incurring point mutations along the way).
def evolve_seq(sequence,
substitution_probability=0.01,
mutation_choices=['A','C','G','T']):
"""Return two child sequences simulating point mutations
An error occurs with probability substitution_probability
independently at each position of each child sequence.
"""
# Generate two lists for storing the resulting sequences
r1 = []
r2 = []
for base in sequence:
if random() < substitution_probability:
# a point mutation will occur at this position
# what's wrong with the following statement?
r1.append(choice(mutation_choices))
else:
# no point mutation at this position
r1.append(base)
if random() < substitution_probability:
# a point mutation will occur at this position
# what's wrong with the following statement?
r2.append(choice(mutation_choices))
else:
# no point mutation at this position
r2.append(base)
# convert the lists to strings and return them
return ''.join(r1), ''.join(r2)
Define a main
function, which represents the main execution block of the code. This will be used to simulate evolution of sequences over a user-defined number of generations.
def main(root_sequence,generations,verbose=False):
# initial some values and perform some basic error checking
assert generations > 0, "Must simulate one or more generations."
# can you simplify the following test?
for base in root_sequence:
assert base != 'A' or base != 'C' or base != 'G' or base != 'T',\
"Invalid base identified: %s. Only A, C, G, or T are allowed." % base
# initialize a list of the previous generation's sequences - this gets used
# in the for loop below. since we'll start with the first generation of
# children, root_sequence is the previous generation's sequence
previous_generation_sequences = [root_sequence]
# iterate over each generation (why do we add one to generations?)
for i in range(1,generations+1):
# print the generation number and the current number of sequences
print "Generation: %d (Number of child sequences: %d)" % (i,2**i)
# create a list to store the current generation of sequences
current_generation_sequences = []
# create a list to store the differences in each current generation
# sequence from the root sequence
difference_counts = []
# iterate over the sequences of the previous generation
for parent_sequence in previous_generation_sequences:
# evolve two child sequences
r1, r2 = evolve_seq(parent_sequence)
# count the differences in the first sequence (from root_sequence)
r1_diffs = count_differences(root_sequence,r1)
# append the count of differences to the list of difference counts
difference_counts.append(r1_diffs)
# add the new sequence to the list of this generation's sequences
current_generation_sequences.append(r1)
# count the differences in the second sequence (from root_sequence)
r2_diffs = count_differences(root_sequence,r2)
# append the count of differences to the list of difference counts
difference_counts.append(r2_diffs)
# add the new sequence to the list of this generation's sequences
current_generation_sequences.append(r2)
if verbose:
# if the caller specified verbose output, print the actual sequences
print " %s %d" % (r1, r1_diffs)
print " %s %d" % (r2, r2_diffs)
# print summary information: the average number of differences in the current
# generation from root_sequence
print "Mean differences %1.3f\n" % average(difference_counts)
# current_generation_sequences becomes the next generation's
# previous_generation_sequences
previous_generation_sequences = current_generation_sequences
# upon completion of all generations, return the last generation's sequences
return previous_generation_sequences
Initialize some variables to run the analysis.
IMPORTANT: Update the value of the sid
variable, or you will not be able to download your output.
sequence_length = 100
num_generations = 5
sid = "your-student-id"
assert num_generations <= 12, "Reduce num_generations if running on the class cluster. Max is 12."
assert sequence_length <= 100, "Reduce sequence length if running on the class cluster. Max is 100."
assert sid != "your-student-id", "Enter your student ID in the previous cell."
sequence_fp = "%s_sequences.fasta" % sid
# generate a random sequence composed of ['A', 'C', 'G', 'T']
# of length sequence_length
root_sequence = []
for i in range(int(sequence_length)):
root_sequence.append(choice(list('ACGT')))
root_sequence = ''.join(root_sequence)
# run the simulation and get the final generation of sequences
sequences = main(root_sequence,int(num_generations))
# write the final generation of sequences to a fasta file
output_f = open(sequence_fp,'w')
for i,s in enumerate(sequences):
output_f.write('>seq%d\n%s\n' % (i,s))
output_f.close()
# Print a link to download the sequence file
FileLink(sequence_fp)