Notebook

Import some functions from the python standard library, numpy, and ipython. This will set up our computing environment.

In [ ]:

# import some functions from python's random module - these will
# be used in the modeling process
from random import choice, random
# import some math functions from the numpy library (note that this
# isn't part of the python standard library)
from numpy import log10, average
# import FileLink from IPython - this will be used for downloading 
# our results
from IPython.display import FileLink

First, let's just experiment with the random function to learn about what it does.

In [ ]:

random()

Define a count_differences function, which counts the number of differences between a pair of sequences of the same length.

In [ ]:

def count_differences(sequence1,sequence2):
     """Count the number of differences between two sequences of the same length
     """
     # confirm that the two sequences are the same length and throw
     # an error if they aren't
     assert len(sequence1) == len(sequence2), "Sequences differ in length"
     # initiate a counter for the number of differences
     result = 0
     # iterate over the two sequences and count the number of
     # positions which are not identical
     for base1,base2 in zip(sequence1,sequence2):
         if base1 != base2:
             # this is a commonly used shortcut for incrementing a count
             # and is equivalent to the following statement
             # result = result + 1
             result += 1
     return result

Define an evolve_seq function. This takes a parent sequence, and returns two child sequences to simulate a possible speciation event, where each input sequence (representing a parent's genetic sequence) has diverged into two child sequences (representing the new species, and incurring point mutations along the way).

In [ ]:

def evolve_seq(sequence,
               substitution_probability=0.01,
               mutation_choices=['A','C','G','T']):
    """Return two child sequences simulating point mutations

       An error occurs with probability substitution_probability
        independently at each position of each child sequence.
    """
    # Generate two lists for storing the resulting sequences
    r1 = []
    r2 = []

    for base in sequence:
        if random() < substitution_probability:
            # a point mutation will occur at this position
            # what's wrong with the following statement?
            r1.append(choice(mutation_choices))
        else:
            # no point mutation at this position
            r1.append(base)
        if random() < substitution_probability:
            # a point mutation will occur at this position
            # what's wrong with the following statement?
            r2.append(choice(mutation_choices))
        else:
            # no point mutation at this position
            r2.append(base)
    # convert the lists to strings and return them
    return ''.join(r1), ''.join(r2)

Define a main function, which represents the main execution block of the code. This will be used to simulate evolution of sequences over a user-defined number of generations.

In [ ]:

def main(root_sequence,generations,verbose=False):
    # initial some values and perform some basic error checking
    assert generations > 0, "Must simulate one or more generations."
    # can you simplify the following test?
    for base in root_sequence:
        assert base != 'A' or base != 'C' or base != 'G' or base != 'T',\
         "Invalid base identified: %s. Only A, C, G, or T are allowed." % base
    # initialize a list of the previous generation's sequences - this gets used
    # in the for loop below. since we'll start with the first generation of
    # children, root_sequence is the previous generation's sequence
    previous_generation_sequences = [root_sequence]

    # iterate over each generation (why do we add one to generations?)
    for i in range(1,generations+1):
        # print the generation number and the current number of sequences
        print "Generation: %d (Number of child sequences: %d)" % (i,2**i)
        # create a list to store the current generation of sequences
        current_generation_sequences = []
        # create a list to store the differences in each current generation
        # sequence from the root sequence
        difference_counts = []
        # iterate over the sequences of the previous generation
        for parent_sequence in previous_generation_sequences:
            # evolve two child sequences
            r1, r2 = evolve_seq(parent_sequence)
            # count the differences in the first sequence (from root_sequence)
            r1_diffs = count_differences(root_sequence,r1)
            # append the count of differences to the list of difference counts
            difference_counts.append(r1_diffs)
            # add the new sequence to the list of this generation's sequences
            current_generation_sequences.append(r1)
            # count the differences in the second sequence (from root_sequence)
            r2_diffs = count_differences(root_sequence,r2)
            # append the count of differences to the list of difference counts
            difference_counts.append(r2_diffs)
            # add the new sequence to the list of this generation's sequences
            current_generation_sequences.append(r2)
            if verbose:
                # if the caller specified verbose output, print the actual sequences
                print "  %s %d" % (r1, r1_diffs)
                print "  %s %d" % (r2, r2_diffs)
        # print summary information: the average number of differences in the current
        # generation from root_sequence
        print "Mean differences %1.3f\n" % average(difference_counts)
        # current_generation_sequences becomes the next generation's
        # previous_generation_sequences
        previous_generation_sequences = current_generation_sequences

    # upon completion of all generations, return the last generation's sequences
    return previous_generation_sequences

Initialize some variables to run the analysis.

IMPORTANT: Update the value of the sid variable, or you will not be able to download your output.

In [ ]:

sequence_length = 100
num_generations = 5
sid = "your-student-id"

In [ ]:

assert num_generations <= 12, "Reduce num_generations if running on the class cluster. Max is 12."
assert sequence_length <= 100, "Reduce sequence length if running on the class cluster. Max is 100."
assert sid != "your-student-id", "Enter your student ID in the previous cell."
sequence_fp = "%s_sequences.fasta" % sid

# generate a random sequence composed of ['A', 'C', 'G', 'T']
# of length sequence_length
root_sequence = []
for i in range(int(sequence_length)):
    root_sequence.append(choice(list('ACGT')))
root_sequence = ''.join(root_sequence)

# run the simulation and get the final generation of sequences
sequences = main(root_sequence,int(num_generations))

# write the final generation of sequences to a fasta file
output_f = open(sequence_fp,'w')
for i,s in enumerate(sequences):
    output_f.write('>seq%d\n%s\n' % (i,s))
output_f.close()

# Print a link to download the sequence file
FileLink(sequence_fp)

In [ ]: