# import some functions from python's random module - these will # be used in the modeling process from random import choice, random # import some math functions from the numpy library (note that this # isn't part of the python standard library) from numpy import log10, average # import FileLink from IPython - this will be used for downloading # our results from IPython.display import FileLink random() def count_differences(sequence1,sequence2): """Count the number of differences between two sequences of the same length """ # confirm that the two sequences are the same length and throw # an error if they aren't assert len(sequence1) == len(sequence2), "Sequences differ in length" # initiate a counter for the number of differences result = 0 # iterate over the two sequences and count the number of # positions which are not identical for base1,base2 in zip(sequence1,sequence2): if base1 != base2: # this is a commonly used shortcut for incrementing a count # and is equivalent to the following statement # result = result + 1 result += 1 return result def evolve_seq(sequence, substitution_probability=0.01, mutation_choices=['A','C','G','T']): """Return two child sequences simulating point mutations An error occurs with probability substitution_probability independently at each position of each child sequence. """ # Generate two lists for storing the resulting sequences r1 = [] r2 = [] for base in sequence: if random() < substitution_probability: # a point mutation will occur at this position # what's wrong with the following statement? r1.append(choice(mutation_choices)) else: # no point mutation at this position r1.append(base) if random() < substitution_probability: # a point mutation will occur at this position # what's wrong with the following statement? r2.append(choice(mutation_choices)) else: # no point mutation at this position r2.append(base) # convert the lists to strings and return them return ''.join(r1), ''.join(r2) def main(root_sequence,generations,verbose=False): # initial some values and perform some basic error checking assert generations > 0, "Must simulate one or more generations." # can you simplify the following test? for base in root_sequence: assert base != 'A' or base != 'C' or base != 'G' or base != 'T',\ "Invalid base identified: %s. Only A, C, G, or T are allowed." % base # initialize a list of the previous generation's sequences - this gets used # in the for loop below. since we'll start with the first generation of # children, root_sequence is the previous generation's sequence previous_generation_sequences = [root_sequence] # iterate over each generation (why do we add one to generations?) for i in range(1,generations+1): # print the generation number and the current number of sequences print "Generation: %d (Number of child sequences: %d)" % (i,2**i) # create a list to store the current generation of sequences current_generation_sequences = [] # create a list to store the differences in each current generation # sequence from the root sequence difference_counts = [] # iterate over the sequences of the previous generation for parent_sequence in previous_generation_sequences: # evolve two child sequences r1, r2 = evolve_seq(parent_sequence) # count the differences in the first sequence (from root_sequence) r1_diffs = count_differences(root_sequence,r1) # append the count of differences to the list of difference counts difference_counts.append(r1_diffs) # add the new sequence to the list of this generation's sequences current_generation_sequences.append(r1) # count the differences in the second sequence (from root_sequence) r2_diffs = count_differences(root_sequence,r2) # append the count of differences to the list of difference counts difference_counts.append(r2_diffs) # add the new sequence to the list of this generation's sequences current_generation_sequences.append(r2) if verbose: # if the caller specified verbose output, print the actual sequences print " %s %d" % (r1, r1_diffs) print " %s %d" % (r2, r2_diffs) # print summary information: the average number of differences in the current # generation from root_sequence print "Mean differences %1.3f\n" % average(difference_counts) # current_generation_sequences becomes the next generation's # previous_generation_sequences previous_generation_sequences = current_generation_sequences # upon completion of all generations, return the last generation's sequences return previous_generation_sequences sequence_length = 100 num_generations = 5 sid = "your-student-id" assert num_generations <= 12, "Reduce num_generations if running on the class cluster. Max is 12." assert sequence_length <= 100, "Reduce sequence length if running on the class cluster. Max is 100." assert sid != "your-student-id", "Enter your student ID in the previous cell." sequence_fp = "%s_sequences.fasta" % sid # generate a random sequence composed of ['A', 'C', 'G', 'T'] # of length sequence_length root_sequence = [] for i in range(int(sequence_length)): root_sequence.append(choice(list('ACGT'))) root_sequence = ''.join(root_sequence) # run the simulation and get the final generation of sequences sequences = main(root_sequence,int(num_generations)) # write the final generation of sequences to a fasta file output_f = open(sequence_fp,'w') for i,s in enumerate(sequences): output_f.write('>seq%d\n%s\n' % (i,s)) output_f.close() # Print a link to download the sequence file FileLink(sequence_fp)