Here I'm posting my solutions to the problems from this course. I started this file on January 13, 2014.
Find the most frequent k-mers in a string.
Input: A string Text and an integer k.
Output: All most frequent k-mers in Text.
genome = "ACGTTGCATGTCGCATGATGCATGAGAGCT"
k = 4
def genomekmers(genome, k):
kmers = {}
freqs = {}
positions = {}
for i in range(len(genome)-k):
kmers[genome[i:i+k]] = kmers.get(genome[i:i+k], 0) + 1
positions[genome[i:i+k]] = positions.get(genome[i:i+k], []) + [i]
f = kmers[genome[i:i+k]]
if f>1:
freqs[f-1].remove(genome[i:i+k])
freqs[f] = freqs.get(f,[])+[genome[i:i+k]]
return kmers, freqs, positions
def topkmers(genome, k):
kmers, freqs, positions = genomekmers(genome,k)
output = freqs[max(freqs.keys())]
print " ".join(output)
return output
genome = "TCGAACATGAAACTTTTGAAACTTTTGAAACTTTTCGAACATATTCTTGATAAATTTCTTGAAACTTTTGAAACTTTTCGAACATCGAACATGAAACTTTTATTCTTGATATTCTTGATGAAACTTTTAAATTTCTATTACTATCTAAATTTCTTATTCTTGAATTACTATCATTACTATCTATTCTTGATAAATTTCTTATTCTTGATAAATTTCTTAAATTTCTTATTCTTGATATTCTTGATCGAACATCGAACATGAAACTTTTGAAACTTTTATTCTTGATATTCTTGATCGAACATATTCTTGATAAATTTCTTAAATTTCTTAAATTTCTTAAATTTCTTAAATTTCTATTACTATCTAAATTTCTTAAATTTCTTAAATTTCTTATTCTTGATATTCTTGATAAATTTCTTGAAACTTTTCGAACATAAATTTCTTCGAACATAAATTTCTTGAAACTTTATTACTATCATTACTATCTGAAACTTTATTACTATCTATTCTTGATAAATTTCTTGAAACTTTATTACTATCTATTCTTGAATTACTATCTGAAACTTTATTACTATCTAAATTTCTTAAATTTCTTCGAACATGAAACTTTTAAATTTCTTAAATTTCTTATTCTTGATAAATTTCTTATTCTTGATCGAACATCGAACAATTACTATCTAAATTTCTATTACTATCTAAATTTCTTAAATTTCTTATTCTTGATAAATTTCTTATTCTTGATCGAACATATTCTTGATCGAACATGAAACTTTTAAATTTCTTATTCTTGATAAATTTCT"
k = 11
t = topkmers(genome,k)
TAAATTTCTTA
Reverse complement a nucleotide pattern.
Input: A DNA string Pattern.
Output: the reverse complement of Pattern.
def reversecomplement(genome):
function = {"A":"T", "T":"A", "C":"G", "G":"C"}
output = [function[x] for x in genome[::-1]]
return "".join(output)
reversecomplement("CGTTTCGGCG")
'CGCCGAAACG'
Find all occurrences of a pattern in a string.
Input: Two strings, Pattern and Genome.
Output: All starting positions where Pattern appears as a substring of Genome.
def patternmatching(pattern, genome):
kmers, freqs, positions = genomekmers(genome,len(pattern))
output = positions[pattern]
print " ".join('{}'.format(i) for i in output)
return output
p = patternmatching("ATA", "CGATATATCCATAG")
2 4 10
Return all starting positions (in increasing order) where CTTGATCAT appears as a substring in the Vibrio cholerae genome.
file = "/Users/javier/Documents/Data/stepic/vibrio_cholerae.txt"
with open(file, "r") as myfile:
vibcholegenome = myfile.read()
pattern = "CTTGATCAT"
def positionskmers(genome, k):
positions = {}
for i in range(len(genome)-k):
positions[genome[i:i+k]] = positions.get(genome[i:i+k], []) + [i]
return positions
positions = positionskmers(vibcholegenome, 9)
positions[pattern]
[60039, 98409, 129189, 152283, 152354, 152411, 163207, 197028, 200160, 357976, 376771, 392723, 532935, 600085, 622755, 1065555]
Find patterns forming clumps in a string.
Input: A string Genome, and integers k, L, and t.
Output: All distinct k-mers forming (L, t)-clumps in Genome.
def isthereaclump(l, k, t, L):
for i in range(len(l)-(t-1)):
if l[i+(t-1)] + (k-1) - l[i] <= L-1:
return True
return False
def clumpfinder(genome, k, t, L):
pos = positionskmers(genome, k)
clumps = {key:value for key,value in pos.items() if len(value)>=t and isthereaclump(value, k, t, L)}
return clumps.keys()
genome = "GCGGTTATGCACCGTTCAAATTAGCAAACCACTAAGCGACGTAGTCTGGATTGATTTCTCCCTACCAGTGACCCAAGACGCGTTAGTGAGTTAAGTTCATATCCAGTACCTGCCGCCCTCTGTACTTGGGCGTCCGATTCGCATGCTTACTCAGGTGGAGGACACGATAATCTGATTAAACTGAGCTAAACCAGGTGGAACCAGAAACCAGGTGGGGAGTCTCGCTTCAAGCCGTTCTTGCGATCAAACCAGGTGGTCCATTATGAAACCAGGTGGCTAAACCAGGTGGTCCAGATCCTCGAATGATGTCGGTGCACATCAAAACCAGGTGGGGTGGTGGAACGTAAAACCAGGTGGCATAAACCAGGTGGGCCGGTTCGTAAACCAGGTGAAACCAGGTGGGGTGGAAACCAGGTGGGTTACAAATTACGTTGAGATGGCCCAAACCAGGTGGTGGGCTTCACCCATGTCAACAAACCACCCTATGGAACTAAACCAGGTGGAACCAGGTGGTGAAGGCTTATCCTCAGGAAAAACCAGGTGGAGGTGGTGAAATAAAACCAGGTGGACCAGGTGGATAACCCTCGCCTCGCTTCTCAACCGAGACCTGGATAAACCAGGTGGGGTGGTCCACCGATTTTTGAGACACTAGAAACCAGGTGGGCGGGGAAACCAGGTGGCAAACCAGGTGGGGTGGACGGAAACCAGGTGGATATGTCATAAAACCAAACCAGGTGGTGCACCCCCATGGTGTGTCTTATCCGTGCGTATAAACCAGGTGGTCGCACGGCTTCCACTTGCTGAGAATAGGCCCGCAGGGTCAGTGCCATGCCCTCCGTCACTCGATATGTGTTGTAAGAGTGGTTACCCCTTCATTGAAGTCGCCCACAGCCCCACCTGCATTGCTAGACTATCACCCTACAGTAGGCCTTTTCGCCTTCTTCAAGCAGCAATCTCTTATCCGCGGATGGGCGCGGCGAGCGTGGCGTCCCCGAACATTTTTACCTAACGTGTTTTGTTGGCCGCAAGCCTTCCCTCTAGTCCACCTCAGCCATTCAGCCTAGTAGCTTTCAAGCCGAGCCTTCCATATCTAATGGACCGTCCAGAATTTCACACGTTTCACAGGGCTGTGTTCGACCGCCCGTAATGCTGTTTCACAGGCGATCGCCTTGCGGTTTTTTCACAGATCGCAGCCGATGGACATGCCAACTCGATTTTCACAGAGTTTTTCACAGCGGTTTCACAGCACAGCAGTGATTGTTTCACAGCAATTTTCACTTTCACAGGGGCCCTTTTCACAGCTCAGGGCTCTTTTCACTTTCACAGTTTCACAGCGCTCCTTTCACAGAGCGGGGAAATTTAAGGGAACACTCAAGGGAACAAGGGAACACACAAAGGGAACACAACACAACACATAAGGGAACACTTTCACAGAACACAAAAGTCCGAAATCATCAGCGGCGAAGGGATTTCACAGACAGACACTTTCACAGCGCATTTCACAGATACGTACTTTCACAGGCGTACTTTCACAGACTTTCACAGAGGACAAGCTCAATTTTCACAGACAGGCTGGATAAATTTCACAGCGGTAAGGGTTTCACAGCACACATAAGGGAACACGAATTTCACAGCAGGGAACACCTCTACGAGTAATCTATTACTCTACCTACTGAAGGGAACACACCGAAGACCTACTATTACCTATTACTCTTAAAGGGAACACATTACAAGGGAACACACTCTCTCGTCATATCTCACCTCTCTATTACTCTTAAGGGAACACCTTCTCGATCAACCTATTACTCTATGGAGATAGAGATATTCCAGACATATGGAGATAACATGGAGATATGGAGATAATGGAGATGGAGATAGCTCTTATATTTATCCTATGGAGATATGATACTATTAATGGAGATAATTCTAATGGAGATATAATTACTCTAAGAGGATGGGATCTCGGGCTATTACTCTAATGGAGATAAGCACTATTACTCTAGGAAATGGAGATATGTCAATGGAGATATGTAATGGAGATAGAGGGAGATGGAGTCGCCATTTCATAATCGCCATTTCATAGTTCAGGAATCGCCATTTCCGCCATTTCTAAGATGGAGTCGCCATTTCTACGTATGGAGATAGGATCGCCATTTCATACGACCCGTTGGATATCGCCATTTCCTCGCCATTTCTGGTGACATTTCTCGCCATTTCATTTCTGGAGATAGATGGATCTCGCCATTTCATAGGAATCGCCATTTCCACGTAGGGGGGGCCACAATCCGTAGGTCGGAATTCAGACTCGCCATTTCCCATCGCCATTTCTTCACCTGTATGCCGATCCCTTCGCCATTTCTCATGGAGATAACTCTCTCTCGCCATTTCTCGCCATTTCCATTTCACTCTCATTCGCCATCGCCATTTCCATTCGCCATTTCATCGCCATTTCTTCAGGATAAGATATCGCCATTTCGACTCTCATTCGCATACTGACTCTCATTCTCATCTCGCCATTTCTCATCTGACTCTCATCCTGGGGGAAACTTGCGACTCTCATCACACTTCCGTCGACTCTCATACTGGCGGATAGCATAGGAGCCATTTAAAGACTCTCATTCTCATTCGAGACTCTCATTCAAATCCTACGAGGACTCTCATATAGACTCTCATATCATTACGAGGACTCTCATATACGAGCCATGCATGTGGCGACGACTCTCATCTACGAGCCATGCAAGCAGAATCTACGAGCGACTCTCATTACGAGCCATGTGACCGTACGAGCCATGCATGCATGCCATGCTGACTCTCATCGAGTACGAGCCATGGAAGTTCTTGTTGGTTCGTAGCCCAAGAGCTGAAGTTACGAGCCTACGAGCCATGAAGTTACTTTTACGAGCCATGAAGCTTACGATACGAGCCATGCGAGCCATGCATCCGCGCTACGAGCCATGTTCCAGTACGAGCCATGTTAGTTGCTGAAGTTAAGTTTGGCGCTGAAGTTTGTACGAGCCATGTGCCCGCTGAAGTTTGTTGTACGAGCCATGCATGCTGAAGTTAATGGCTGAAGTTAGCGTTTGCGGGCAGATCCTCATTCTACGATACGAGCCATGCCATGCAGCTGAAGTTAAGTTGGGTTACGAGCCATGCGAGCCATGTGAAGTACGAGCCATGCTGGCTGAAGTTGTTTGTGCTGCTGAAGTTGCTCTTGTCTCTAGCTGAAGTTGCCAACAGGGCTGAAGCTGAAGTTTAAGCTGAAGTTGCGAGCAGGCTGAAGTTATCGGATTGGGGCTGAAGTTCAACCTCCCGTCCCCCCACACTATATTCCCGTCCCCCCCCGCGCACGCGCCGTCTCCCGTCCCCCCTATCCCGTGCGCACGCGACGCGATCCCGTCCCCCCAGAGTGCGCGCACGCGTCCCCCTTCCCGTCCCCCTCTCCCGGGCGCACGCGTCGCTCAACATTTCCGCGCACGCGTCGCGCACGCGGGCGCACGCGGGTCCCGTCCCCCCCCCTCTTCGGCGCACGCGGAATTCCCGTCGCGCACGCGTCCCGTCCCGCGCACGCGTCGCGCACGCGACTGCCCTAACCAACAGTGCGCACGCGCCGGTAACCCGGTAACCCGGTAACCGCGCACGCGGGCGCACGCGCGTAACCCGCGCACGCGCCGCGCACGCGGCCCGGTTCCCGTCCCCCCCGGTAACCCGGTAACTCCCGTCCCCCGTAACCCGGTGCGCACGCGCCCGGCGCACGCGGAGCGCACGCGCCCCCCCCGGTAATAGCGCACGCGCCCGGGCGCACGCGCCCGGTAACCCGGTAACCCGGGCGCGCGCACGCGGCGGCGCACGCGGCGCACGCGGCGCACGCG"
k = 11
L = 566
t = 18
clumpfinder(genome,k, t, L)
['AAACCAGGTGG']
file = "/Users/javier/Documents/Data/stepic/e-coli.txt"
with open(file, "r") as myfile:
ecoligenome = myfile.read()
k = 9
L = 500
t = 3
clumps = clumpfinder(ecoligenome, k, t, L)
len(clumps)
1904
We define Skew_i(Genome) as the difference between the total number of occurrences of G and C in the first i nucleotides of Genome.
def skew(genome):
output = [0]
count = 0
for x in genome:
if x == "G":
count = count + 1
if x == "C":
count = count - 1
output = output + [count]
return output
skew("CATGGGCATCGGCCATACGCC")
[0, -1, -1, -1, 0, 1, 2, 1, 1, 1, 0, 1, 2, 1, 0, 0, 0, 0, -1, 0, -1, -2]
Find a position in a genome minimizing the skew.
Input: A DNA string Genome.
Output: All integer(s) i minimizing Skewi (Genome) among all values of i (from 0 to |Genome|).
genome = "TAAAGACTGCCGAGAGGCCAACACGAGTGCTAGAACGAGGGGCGTAAACGCGGGTCCGAT"
def minskew(genome):
t = skew(genome)
l = [i for i,n in enumerate(t) if n == min(t)]
return l
minskew(genome)
[11, 24]
Find all approximate occurrences of a pattern in a string.
Input: Two strings Pattern and Text along with an integer d.
Output: All positions where Pattern appears in Text with at most d mismatches.
def appmatchpat(p, q, d):
count = 0
for x, y in zip(p,q):
if x != y:
count = count + 1
if count > d:
return False
return True
appmatchpat("CAACCAA", "ADAAAAA", 3)
False
def appmatch(pattern, genome, d):
pos = []
k = len(pattern)
l = len(genome)
for i in range(l-k):
if appmatchpat(pattern, genome[i:i+k], d):
pos = pos + [i]
return pos
pattern = "ATTCTGGA"
genome = "CGCCCGAATCCAGAACGCATTCCCATATTTCGGGACCACTGGCCTCCACGGTACGGACGTCAATCAAAT"
d = 3
appmatch(pattern, genome, d)
[6, 7, 26, 27]
for d in [1,2,3,4,5]:
print "Para", d, " hay: ", len(appmatch("AAAAA", "AACAAGCTGATAAACATTTAAAGAG", d))
Para 1 hay: 4 Para 2 hay: 10 Para 3 hay: 16 Para 4 hay: 20 Para 5 hay: 20
Find the most frequent k-mers with mismatches in a string.
Input: A string Text as well as integers k and d. (You may assume k ≤ 12 and d ≤ 3.)
Output: All most frequent k-mers with up to d mismatches in Text.
(Note that such k-mers do not need to actually appear as substrings of Text.)
import itertools
def ball_edit_distance_d(pattern, d):
output = []
indexes = range(len(pattern))
for i in itertools.combinations(indexes,d):
for j in itertools.product("ACGT", repeat=d):
s = list(pattern)
for k in range(d):
s[i[k]] = j[k]
output = output + ["".join(s)]
# print i, j, "resulta:", "".join(s)
# This generating function is not very efficient. In particular it over generates several patterns.
# That's the reason for the "set" in the next line.
return list(set(output))
def countaproxkmers(genome, k, d):
kmers = {}
balls = {}
for i in range(len(genome)-k):
# print i, "out of", len(genome)-k, "-",
l = balls.get(genome[i:i+k], ball_edit_distance_d(genome[i:i+k],d))
for kmer in l:
kmers[kmer] = kmers.get(kmer,0)+1
return kmers
def mostfrequentapproxkmers(genome,k,d):
# print "Starting counting"
kmers = countaproxkmers(genome, k, d)
# print "Done counting"
top = max(kmers.values())
m = [k for k,v in kmers.items() if v==top]
print "Result:"
for i in m:
print i,
return m
genome = "TACAATGAAGATGAGAAGAATGTACAAGAATGATGTCAATGTACATACAAGAAAGAGAATGAGATCAATGTACAAAGAGAAAGTCAATGATGTACATACAAGAAGAATGATGTCAAAGAAGAGAATGTCATACAAAGTCAAAGAGATACAAAGTCAAGAATGTACAAAGTACAAAGAGAAAGTCAAAGATGTCATCAAGAATGAAGATGAGAAAGTCATACAATGTCAAGATACAATGATGAGAATGTACAAAGAAGATGAAGTACAAAGATGTCAATGAAGTCAATGTACATCATCAAGAAGAATGAGAATGTCAAGATCAATGAGAAAGTCA"
k = 10
d = 2
kmers = mostfrequentapproxkmers(genome,k,d)
Result: AGAATGTCAA
Find the most frequent k-mers (with mismatches and reverse complements) in a DNA string.
Input: A DNA string Text as well as integers k and d.
Output: All k-mers Pattern maximizing the sum Count_d(Text, Pattern) + Count_d(Text, Pattern) over all possible k-mers.
def countaproxkmersandreversecomp(genome, k, d):
kmers = {}
balls = {}
for i in range(len(genome)-k):
# print i, "out of", len(genome)-k, "-",
g = genome[i:i+k]
l1 = balls.get(g, ball_edit_distance_d(g,d))
h = reversecomplement(g)
l2 = balls.get(h, ball_edit_distance_d(h,d))
for kmer in l1+l2:
kmers[kmer] = kmers.get(kmer,0)+1
return kmers
def mostfrequentapproxkmersandreverse(genome,k,d):
# print "Starting counting"
kmers = countaproxkmersandreversecomp(genome, k, d)
# print "Done counting"
top = max(kmers.values())
m = [k for k,v in kmers.items() if v==top]
print "Result:"
for i in m:
print i,
return m
genome = "GGGGGGTCAGGTAAAGGGGGGGAAGGTCATCATAGGGAAAAGGGTCAGGTCATAGGTATAGGTATAAAGGAAGGGTAAATCATAGGAATCATCATAGGGGGTCATCATAGGGGAAAAAATCAGGGGGGGGGGGGGGGGGTAAATCAGGTATCATATCATCATAGGTAGGTCAGGGGAAGGGTCAAATCAGGGGGGTCAGGGAAGGGAAAATATATCATA"
k = 9
d = 2
kmers = mostfrequentapproxkmersandreverse(genome,k,d)
Result: GGGGGGGAA GGGAAGGGG GGGGGGAGA GGGGGGGGG CCCCCCCCC TCTCCCCCC CCCCTTCCC TTCCCCCCC
Translate an RNA string into an amino acid string.
Input: An RNA string Pattern.
Output: The translation of Pattern into an amino acid string Peptide.
Here is the RNA Codon Table.
# Read the table
import urllib2
url = "https://stepic.org/media/attachments/lessons/96/RNA_codon_table_1.txt"
table_file= urllib2.urlopen(url)
codon_table = {}
for line in table_file:
l = line.split()
if len(l) != 1:
codon_table[l[0]] = l[1]
else:
codon_table[l[0]] = ""
print codon_table["UAA"] == "", "and", codon_table["AAA"] == "K"
True and True
def t_to_u(n):
if n == "T":
return "U"
else:
return n
def u_to_t(n):
if n == "U":
return "T"
else:
return n
def DNAtoRNA(genome):
g = list(genome)
return "".join(map(t_to_u,g))
def RNAtoDNA(genome):
g = list(genome)
return "".join(map(u_to_t,g))
DNAtoRNA("AGTCA")
'AGUCA'
def protein_translation(genome):
g = DNAtoRNA(genome)
splitted_g = [g[0+i:3+i] for i in range(0, len(g), 3)]
if len(splitted_g[-1]) != 3:
splitted_g = splitted_g[:-1]
trans = [codon_table[c] for c in splitted_g]
return "".join(trans)
protein_translation("AUGGCCAUGGCGCCCAGAACUGAGAUCAAUAGUACCCGUAUUAACGGGUGA") == "MAMAPRTEINSTRING"
True
g = "AUGGCC"
s = [g[0+i:3+i] for i in range(0, len(g), 3)]
print s
print s[-1]
print len(s[-1:])
d = [codon_table[c] for c in s]
"".join(d)
protein_translation("AUGGCC")
['AUG', 'GCC'] GCC 1
'MA'
genome = "AUGUCCAGUAACGUGUCUGUAGUCGUCACUCAGUCUAACAACCGACUUCAUACACUGAACAGUCGACACGAGCCAAGGGCUUCCAAACUGUUUCUUGAUAUAGUUCAUUUGUCAACUGUCGAUCUAUUCGUUCUCGUGCGGUACCUAACAGGUAGAGAGUCCUCCAUUUCUCUCGCGCGAGCAGUGAGACUUAUUAGAAGCCCGUGGAGGGCAGAAGUCAGUGCUCUGUGUGAUGUCAUAUGUAAAGAGCCCCCUAGUGAAGCAGUGUCGGAGUCAGUGCAUGACACAAGUCUAUGUGGCCCGGAGUCGACACGCAGCCGGAGAAAGGCGGGGCGCAACACCGAUGCACGUGAGCUGAAUAGAAUUGAAUUUCUCCGGGUAAUAACAAGGCAGCUAGGUGGAGUGAAACGGGCUUCGGGCGUCUGCCACUUAACGCGAACUAUGCCGGGAUCGUGCAAACUCUUCAAUGCACCCAAGAUUAUGUCCCGGGUGAAUUUUACUUGGCAGACUCCGGACUGUUUGCCUGGAGGCCCCUUAUUGACUUCGCUUCUAGACGGCACGGUACAAAGAUCUCGCAACUCAGGCGUCGUCUUGCGUAUGGCAAAAAGUCGCGUCGAGGCUGUAUGGGCUAAGACCGAUCUGAACCAAUUUAAAGCAAUAGUGAGAGCGACCACGUCUCUGCUUGGGGUCCCAAAAUGGAGUUCUAUAUCCCGUUGCGAUAGAACCUCUGCCCGGCUCAUAUCAUCUAUUCCAUCGAUCGCAGUUACUACAAUCACGCUACACGAUGACGGGAUAACCUUUUCUACACCCCCCGUCUUACCAAUCUACUCUGGUGACAUAAAAGGUUUACGACGCCGGGGGCAGCCACGCUUAACUGCCCUUCCAUCUAAAAACUAUAGUCAUCCUUGUCCUGUCGCGCCUAAUAUCGUGGUACCCGUGCUAAUGGGUAGUCUUGCCGGUUCUCGACUCAGGAUUAAUACCCCCAUUCACGUAGCAUGUCUUCGUUCUGCCGAAGGCUUGAGCAUGAACAGUCUGGUUAUACUAGUCGAUGUAUGCACAGUGAGGACUCCUGCAGGUAUACUCAGAGGUUUAAAAUACACACGCGACCCACGGUUUGAGUCAGCAGGGUCAUCCAUGAACUUGACGAAAUGUUCGAUCAUGAUGGGUCCCAUCAGGUGGGACGUCAAUAUUUCUCUCGGCGAUACGAGUCGUGCCUUGUACGCAUGUCCCACUGGUCACUUCGCGAGGGCGGUGAAACACUGUAAGCACUAUGGUCGGUUGUGUCGCUCUGCACGAACGCACGGAGAUGUUCCACCAUCGAACCCCCGUAUUGACCUUGUUAUGAGUAGGCUUAUUCGCAGGGAGGGAUAUCGAGGCAUCGGAUUAGAUUGGCCCAGGAUCCCCGUCGAGGAUUUGCACAUCUCCGUCCGGCUACCCGUGGACAGCCCCCAGCCGUUCUUCACGUUACUUAAACAUAAGCCCGCACGCUCCCACGUUUGUCAACGCCGCACUCGCACUGUCGGGCGGACAACUGACGAAACGCUCUGCUGUAAAAUCGGGUGCCAUAUUAGGAGGGCAGACUCUAAUACCCGCAUACUGCAGCCUGCUGACGCCCGUCGAACGCUAGCGGCAGUCCUAUGCCUUGCGGGAGAUCUAAUCAUACAGUCCGUUCGCGUAAGGGGUAGCCUAUUCAAGCAAUCUUAUGCAAUAAAAGUACCUUCCUCGAAGAGCGCCUUUCGGCCAGAGACUCGUUUUCGCUUUGGGCUGGCAAACAUACAUGAGGACCGGUUAGAGGAACUUGAACAUCAAGACAAUUGCCCUUGGCGAUGCGGCGGACUCUCUAUUAGCCGCAUGGAGCUUGCAAUAAAUGAUAUGCAGGGUUCGCUUAGCCUGGAACUUGUCCAUAUAUAUGCAGACAUGUUCAGCUUUUAUUCUAACGCUGGACUCAUUACAUCUAAAGCCGGCGGUCGCCAGAUAGCUGACCACUUGGAUGUAUUUCUCUUGACGACUAACGACGGUCAUUAUUGUAGGAUGGGCGAAGCAAGUGCUGAUGAGAGCAGCUCAAGGCAGCAGGGUGAAACUCUAGAGCUCGGGAUGCGACGGAGCCGUAUAUUACGUGUUGUGGUGGUAAAGGUGUACCCUAGCCUCCGACUGACUGAUGAUGGUCCAUUUCGCCAGACUCAUGACUCUCGACGUAUAACGGAAUUUAGGUCCACUAUGAAAGGCCUUGGACUACGUGCUAUAGGGUGGGCCCCUAGGACUUGUACACCUACGCUAAUGAUGCUCCGCUCCAGCACCUUGUCCCGUUCGGCGCAAAGUAACCCCUGUUUAGUUAUCCGGGUCGAUCACGAGACCCUGGCCCAUGUAGAUUCCCGUGGGGCCAGUGGCGAGAGCAAUGAGCAAGCUCUUCGUAAUAACGGACCUCAGGCACCAUCAGUAACGGGAGUAGCCACAGUCCCGGGAGUUGGUUACGUACUUAACCGUAGCCACCUUAAGAGAGUAGAGGGCUAUGUAUGCGGGGAGCUCACUUCGCUAUGUGUAGGACCUAGCUUGGCUACCCGAAUCGUACUAGGUACUUUGGCUAUUUGUCUUUCGUAUCUCCCUCAUGGAACGUGGGUGCGUUCGGCAACUUUAGGAGUGCACUACGGGCACGGUGCGGAGACGGUACGAUCACGAUCCAAUCACAUACGCCACGGCAUUUUAAUGGCAGAUAGUAGCUCGAGACCAGGCCAACUCGGUAUCUACUCUGGCUACUUUAGAGAACGACGAAUCCCAUUGAGGGCGAAUCGCAACCGUUCUCUACUACUCUACGUUUCUAGUAUUCUCCUAGCUCAGCACGUACAUCGCCGGGGAAAUUGCGCAACCGUGGAAGGCAGCAACCCGCUGAGCUACAGGAGUUCGCCGACUCGAUGCUGUAUAAUAGGCAUGACGACGUGCUCUUUCUCAAUAUUACUACCGGCCAAUCGACUGUAUGGUGUUAGAGGUACUGCGGUAACGUUUAAUGCCGUUGGGCCGGACACUAGCCCGACACCUUCGAGUAGCGCCCGGUUAAAACACGAACUUCAGUCCCCCCAUUCCACAUGCGCUAGCCGCCAAUCGUGCUGCUCUUUAGCGAAAGCAGUUACGUGGCAAGAAAUCCAGGACAUAAAUGGCGGUUGUAAUUAUUGUCAUUUAAAAGGGUGCCCAGCCGCACGGACCAAGCCGAGCGCGCGAUGCCGCUUCCGUACAAACAUCUAUCCUUCGGCAGGCGUACUGAGCCUUAGAACAUCGACAUUUGCGUCACCUAAUCACUUUCAGAUAAUAAUAAGACGCCUGCCUGUACUCUUGUUAGUGUCAGCACAGCGGAACAUCAUCUUAUCCUGGGGCUGUCGGCUAUGUCGACGCGCUCAGAGUUCGAACAAGAAUUUUGGGCCCGCGAUCCCAUGCUGUGGUCCUAUUAGUGGAAAAGGAACGGAUAAAAUUUUAAUUAUACUAAAGAUGGAAAUCCCAUUAACACUGCCUUUAGUGGGCGCCUAUGUUGAAAUUGUAUCACUUCGUGUACCUACCAAAGUAGCCAAAGCUUUGAAUCGUGGUAAAAAAUGCCAUAUUACAGUGUGUCACAGCAAGAACAGGCGGUUCGCGACGGUAACCCCAGUCCUCUCAAUAACUGUUUCGGUUCUAUCUACGGAGGGAGUCACCCGCAAGAACCCUAGGACUAUUACUGAGAAGGUGGGCGUGAGAGAGAUAUGGUUAACUCGCGACUCCGCGAACUCAGACUCGCUAGAAUUGACACAGUGUUUAUGCUCAGCUACACCUAGAUUGCGCAUGAUUCGUUCGGGACGUAUAAUAUUGGCUACGCGAAGUAUAGACUGGCGUCUACCCACAGGUCAGCGUGUUUUUAGAGCGUCGUUAAGCGCUUUCGGACCUAGCGCUUCGCGACCUGCUUCUCUAUGCCGAAUCUUCCGCCGCUGGAAUACCGUUCCAUGCCACGUCUACACUAGUUGUCAGCGAGGGUUAUACAUUCCCGCGCGUCGAAGCUCUAUUUAUCCGCCCUUCGCCUGUCACAAAACUUGCAGACUCUUGGUGAUCCACAUUGAUGUCUCUCAAGGCGUGCAUACCACUCCAACGACAGAGGAUACAUUCGACUUGAACUCACGGUCCUUGCUGCAUACCUCUAAGCACAUACAGCCGUUUUAUUGUGAAGGCGACCUCUUUGUGACGGAGGAAAGGUUGGCCAGUCGAUAUGUGGAGACAGAGGUAAGAAAAAACAUCAGUUUUCCAUUUCUGGCUUUCCCUACUACGAGUCCGAGCCGAGCUAUUCAGGCAAGAACCUUGUUACAGCUUAUCCUCAUAAACUUCUACGGGAUGAAUAUGUACCCACUGAUACCUAUGUCGAGAGUUAUGCUCGAAUUUUGUUUAUUGCCCUUGCCGACUAGCCCUCCCGUCGGGUACCGAAGGAUUCCGGGAAGGAACUCUGUAUCUACCCGCCGCCCAUCACGGAGAAAGUACGGUUGCAGCGCAGUGGCAUCGCCUAAACCACACCUUUGUGGAAACAACGUCAGGAUGCUGUGGGAACUACACGUGAGCAACACUCGCGGAUCCAUUAUAUGGAGUCACCCAGCUAAUAUUCGUACGGGAUUUUCUCACCGCGCCGUGGUCAAUAUACUCCAGUAUCAACCGAGCACUGUACCAACAUGGUACACUCUAGAUAGCAGCAAUACCGGAGAUGGGCUUGAACGAGACCACAUAGGAGGCCCGAAGCUAAUGUCAAUAAAGCUGCGGCGCAUUUAUGUUCAAUGCCACAAGUGCUGCCGGAACAAAUCGGCACAGGAAACAACCAACCCGUUGAGGGAGGUGGCAGCAAGCUUGCAAUUAGCAAGAGGACCUCCAACAGUAUGCCAGACCGACCUGUCUUGUGCAUGUCGUCACCUGACGUGUAUGCCUAGCUAUCGCCCAUCCUUGUGUUCUUCCUGGAGGUUAGUCCGCAGCCGGGCUACUAACUGUGAACGUGGCGUUGAAGAACCACUAGUUUUCUUAAGUCAGGCAUUGUUCAAGUUCUACUGCGCCGUCACACGCGAAACAGCCUCUUUCACACCCUCAGUUUCUCUACGGCGCAAAGGGAUCGGCCUCGGUCAGGAACUAUCAACUUGUUCUAUUCGAUUUCCAUCUAGCACAAGCUACCCCCGCGCUCUUCUCCCCCCAACAGACCUGUUGGUGCCAACUUACUGCUCGUCCAAAGCAUCCAGCUCUAUCCAUUUUCUCGUAUUGUGGCAAGGACUCGACGGCAAUUUUUGUAAUUUUCGUCAUUGGGCGUACAUCUUCGUGCACCAAUCUAAUGGAGGUCUGGAGUCACGGGUACAUUGGAUGAUAAGGGCUCCCAAGACAACUUUACCGGAUUAUAGAAUAAUAGAGCCCCCUCAUCGUAUAAAGUGUGCUGCGCAAAACUUGCAACAUGCGAUCAGAGGUCCAACUCGACUUGCUCCAAAAACCAGAUUUCUUCGUGACCGGUCCGCAAGGUAUCCAGGAAACCGACGCAUGUACAGAACGAUGAGCUCUCAAUAUACUCGCAAGUCUUCGAGCAAAAACUCAGGGUCCAGUAUCAGUCCUGAAAAGCGUCCUAAGCUAGCUAGCUUGGAUAAACCACAGCGCAGUGUGAACAAAGGCAUCAAGCGUCUGCAAGACAGAACCCCUGCUUCCCUGGCAUAUCAUGGCCUAGGCAUAGGCGCAGCCUCGCUAAAAGUUUCCGCCACUCGUCAAAGCCAGGUCCAGACAGUCAGUCCCCAUUUCUGGCAAUAUGAGUUCCUAGCCAUCACUGACGGUCUACCCUUUCAGCAUACGCAUGAUACCACUGUAUUUAGACGACCUCGUCCGAAAAUGUGCGGCUGUAGCAGGGAUCUUGGGUCCAGUCCGCACCAUGCUAAAGCGCCAGAUUGCCUUUUACACUGGGCACGUGUGCGGGCAAUCACGCAAGGCCUCACGUGGCAAAGCUGGGCAGCAAUGGUGUCACACUGGCGGACACACGCACAAACUUUUAACGGGACUCCUAACAGGCAAGAAAACGCUAUGCACAAAUAUUUGCACAACUGGGGACGCGCCACCGAGACUAUGGUUCACAUGCUACCACGAUUCAUGGCCAUCACAAGAGGCGCCCACCCGCAGAUGCCCUGCCCAACAUGGGGCCAUCUUGCUACUCCUGCAAGGUUUGCUCAUCCAGCGUCUGAGGAAGAUGGGGGAUUAAUGCGACUAGUGUUUCAUGGAGAUCUUGAACCCCACACUUGGGUUACUUAUAAGCAUACACUAUAUUCGUAUGCCUCACAGAUAUUAAGUCAGGGCAAGCCACUUCCGAGUACGUACAUGCGUCGUCCGUUUGGACAGGCCACGUUCGUUAGCCAUGCGAUGCCUGCCGGCGACUGGCAGGAAAGAAGACAGGAACCUCGCUGCGAUGACUCAUUUAGCCCAAACGUGUUGUGUUAUUCAUUACGUCGUGAAAUAAGAGACGAGCAUCUGCUGAAAAGAAGGAUCGUUAAAUGGAUGAAGCCUGAAAUCGGUUUGACCAUAGAUGACCUCUCCCUCACUGGUCUCGUAAACAAGACAUAUAACGACCUUCAAAGUACGUGUUCCAAUCGGCGGGGAGCGUACGCGCUAAGCGCGCUCGUUUAUUCUGUCCAAUACCACGGGCCCGGGCGGACAUACUGGGCGAGUGUCGGGGCCCUAAUAGACAUUGACCUAAACUACAUGGAGAUGCUCAUCUCACGGUCGACAAUUUGUCGGGGCAUUGGCACGUAUUUCGCUAAUGGUGCCCUCGGGGCGCGGCCUAAAGUUUGUACAGCGCAUGGGAAGAAUCGAUUGGAGCUAGCUGUCAGCUUGUGGUGCCUAAAUCUAAGACAGCUCGAAAGGGGGGAUUCUAUGGUCUUGUGUUACAACUGUAUCAUCGCUGAUACGUCAACUGCUCGAAUAGAUCGUGCCCUUCCUAUAGAGUUUCUAGCCUUGCCGUCACUGUCAAGCUUGAUCAGAUGCAAUGAAGUUCUUCUCCGACGAUCCCGGUAUGUAAAAACCAUCAUGUGCUACACGAAAUUCACGCCGAUAAGCAAAUCGGUGUUCUUGUGGUCGACUUCGCGGACAAUCGAGUCUCUCGUCGUAAGCAAGUCAGCCGGGUCCCCCUAUGUGCACUACCUAUCAUUUUGGCUACUACUUUCUGAGUUACUAUAUUUAAGCAACACCGUAGCGACCAUAUAUUCCGGUCCCUCCGGGUGCACUUAUUUGUUUACAUGGCGGAAAGCUGCUCUCGAUCCGCCACUUCACUCCUUUCUCCGGCCCUAUUAUGAAAGACCGCGCUACUUUUCUGGUCCACUUAGGGACAAUCCACCUCGAAAUCCGACGCCACCACGACACAGUUUUCAACUUGCUUCCAGGUCGCACCGCCCACUUACUUCAUUAGCAAUUAUACGGUCGGAAAUAAGAACAUGGUCGGCCUCAUUAAGGCGCAUCGAUUCCCUCUCGUUGCAUGAGGCUGUUGUAAUAGAUUUCCGUCGUUGGUCAUGGGGCCCGCCCCGUCGGACAGUGGGCAUACGUCGGGCCAUCUAUCUAGAAAAACAGCGGUGGAUCCGUUGUAAUGCCGUGAUUUCUGAGGUUUCAAGGCUCCGUAGCGCAAGUGCUAGGUUGUGGUAUAAGGAGUCUCCUUCCCCCUUCAUCUGUAAAGCAGGGCGUUGCUUUGUACCCCGUAGAUAUGUUCCGAACCGCGGGCGACAGGGGAUGCUCAGCGGCAGCAUAGGUGUUUAUGCUGUCAAGCGUAGGGCCAGAACUGACGUCCGGUCGCAAAGGCUUGAGGCCACCUUGUUACUGCCACUCGGCAGCUUAAUCGCCCUGAGGACCCUAUCCGGGAUGACCAUCGCAGCUACUAACACAAGGGCAGCUUUGAUUCAGGACAUUCCGACACCGAAGAGCUCGGAGCUCGUGGGUCCGGAUGACUCAUCGGAACAUCAGUGUCAGGUAUAUCGGACUCAGUGUUCUAAUCCGCUACGGCGAACUGGGCGGAUCUGGGUAGCAUUAACGCAGACGGGGAUGGGGGUAAUGUGCCCGAGUCUUGGCGCGGUAGACUCAUACAGGGAGGUGGUCCUGCUUACAGUCUUACGUGGUAAGUACGAAAGAGUGGCCUACUGUUCCAAUAGGUGGCACGGUCGCAAUGGCAACACAGAUACAAAGGUAAGAUGCGGCAUUGAGAUUUAA"
print protein_translation(genome)
MSSNVSVVVTQSNNRLHTLNSRHEPRASKLFLDIVHLSTVDLFVLVRYLTGRESSISLARAVRLIRSPWRAEVSALCDVICKEPPSEAVSESVHDTSLCGPESTRSRRKAGRNTDARELNRIEFLRVITRQLGGVKRASGVCHLTRTMPGSCKLFNAPKIMSRVNFTWQTPDCLPGGPLLTSLLDGTVQRSRNSGVVLRMAKSRVEAVWAKTDLNQFKAIVRATTSLLGVPKWSSISRCDRTSARLISSIPSIAVTTITLHDDGITFSTPPVLPIYSGDIKGLRRRGQPRLTALPSKNYSHPCPVAPNIVVPVLMGSLAGSRLRINTPIHVACLRSAEGLSMNSLVILVDVCTVRTPAGILRGLKYTRDPRFESAGSSMNLTKCSIMMGPIRWDVNISLGDTSRALYACPTGHFARAVKHCKHYGRLCRSARTHGDVPPSNPRIDLVMSRLIRREGYRGIGLDWPRIPVEDLHISVRLPVDSPQPFFTLLKHKPARSHVCQRRTRTVGRTTDETLCCKIGCHIRRADSNTRILQPADARRTLAAVLCLAGDLIIQSVRVRGSLFKQSYAIKVPSSKSAFRPETRFRFGLANIHEDRLEELEHQDNCPWRCGGLSISRMELAINDMQGSLSLELVHIYADMFSFYSNAGLITSKAGGRQIADHLDVFLLTTNDGHYCRMGEASADESSSRQQGETLELGMRRSRILRVVVVKVYPSLRLTDDGPFRQTHDSRRITEFRSTMKGLGLRAIGWAPRTCTPTLMMLRSSTLSRSAQSNPCLVIRVDHETLAHVDSRGASGESNEQALRNNGPQAPSVTGVATVPGVGYVLNRSHLKRVEGYVCGELTSLCVGPSLATRIVLGTLAICLSYLPHGTWVRSATLGVHYGHGAETVRSRSNHIRHGILMADSSSRPGQLGIYSGYFRERRIPLRANRNRSLLLYVSSILLAQHVHRRGNCATVEGSNPLSYRSSPTRCCIIGMTTCSFSILLPANRLYGVRGTAVTFNAVGPDTSPTPSSSARLKHELQSPHSTCASRQSCCSLAKAVTWQEIQDINGGCNYCHLKGCPAARTKPSARCRFRTNIYPSAGVLSLRTSTFASPNHFQIIIRRLPVLLLVSAQRNIILSWGCRLCRRAQSSNKNFGPAIPCCGPISGKGTDKILIILKMEIPLTLPLVGAYVEIVSLRVPTKVAKALNRGKKCHITVCHSKNRRFATVTPVLSITVSVLSTEGVTRKNPRTITEKVGVREIWLTRDSANSDSLELTQCLCSATPRLRMIRSGRIILATRSIDWRLPTGQRVFRASLSAFGPSASRPASLCRIFRRWNTVPCHVYTSCQRGLYIPARRSSIYPPFACHKTCRLLVIHIDVSQGVHTTPTTEDTFDLNSRSLLHTSKHIQPFYCEGDLFVTEERLASRYVETEVRKNISFPFLAFPTTSPSRAIQARTLLQLILINFYGMNMYPLIPMSRVMLEFCLLPLPTSPPVGYRRIPGRNSVSTRRPSRRKYGCSAVASPKPHLCGNNVRMLWELHVSNTRGSIIWSHPANIRTGFSHRAVVNILQYQPSTVPTWYTLDSSNTGDGLERDHIGGPKLMSIKLRRIYVQCHKCCRNKSAQETTNPLREVAASLQLARGPPTVCQTDLSCACRHLTCMPSYRPSLCSSWRLVRSRATNCERGVEEPLVFLSQALFKFYCAVTRETASFTPSVSLRRKGIGLGQELSTCSIRFPSSTSYPRALLPPTDLLVPTYCSSKASSSIHFLVLWQGLDGNFCNFRHWAYIFVHQSNGGLESRVHWMIRAPKTTLPDYRIIEPPHRIKCAAQNLQHAIRGPTRLAPKTRFLRDRSARYPGNRRMYRTMSSQYTRKSSSKNSGSSISPEKRPKLASLDKPQRSVNKGIKRLQDRTPASLAYHGLGIGAASLKVSATRQSQVQTVSPHFWQYEFLAITDGLPFQHTHDTTVFRRPRPKMCGCSRDLGSSPHHAKAPDCLLHWARVRAITQGLTWQSWAAMVSHWRTHAQTFNGTPNRQENAMHKYLHNWGRATETMVHMLPRFMAITRGAHPQMPCPTWGHLATPARFAHPASEEDGGLMRLVFHGDLEPHTWVTYKHTLYSYASQILSQGKPLPSTYMRRPFGQATFVSHAMPAGDWQERRQEPRCDDSFSPNVLCYSLRREIRDEHLLKRRIVKWMKPEIGLTIDDLSLTGLVNKTYNDLQSTCSNRRGAYALSALVYSVQYHGPGRTYWASVGALIDIDLNYMEMLISRSTICRGIGTYFANGALGARPKVCTAHGKNRLELAVSLWCLNLRQLERGDSMVLCYNCIIADTSTARIDRALPIEFLALPSLSSLIRCNEVLLRRSRYVKTIMCYTKFTPISKSVFLWSTSRTIESLVVSKSAGSPYVHYLSFWLLLSELLYLSNTVATIYSGPSGCTYLFTWRKAALDPPLHSFLRPYYERPRYFSGPLRDNPPRNPTPPRHSFQLASRSHRPLTSLAIIRSEIRTWSASLRRIDSLSLHEAVVIDFRRWSWGPPRRTVGIRRAIYLEKQRWIRCNAVISEVSRLRSASARLWYKESPSPFICKAGRCFVPRRYVPNRGRQGMLSGSIGVYAVKRRARTDVRSQRLEATLLLPLGSLIALRTLSGMTIAATNTRAALIQDIPTPKSSELVGPDDSSEHQCQVYRTQCSNPLRRTGRIWVALTQTGMGVMCPSLGAVDSYREVVLLTVLRGKYERVAYCSNRWHGRNGNTDTKVRCGIEI
Given a protein, provide a list of RNA sequences that would translate into it.
protein_table = {}
for k,v in codon_table.items():
protein_table[v] = protein_table.get(v,[])+ [k]
protein_table["L"]
['CUU', 'CUG', 'CUA', 'CUC', 'UUG', 'UUA']
protein_dictionary = {"His":"H",
"Gln":"Q",
"Pro":"P",
"Arg":"R",
"Leu":"L",
"Asp":"D",
"Glu":"E",
"Ala":"A",
"Gly":"G",
"Val":"V",
"Tyr":"Y",
"Ser":"S",
"Cys":"C",
"Trp":"W",
"Phe":"F",
"Asn":"N",
"Lys":"K",
"Thr":"T",
"Ile":"I",
"Met":"M"}
tyrocidineB1 = "Val-Lys-Leu-Phe-Pro-Trp-Phe-Asn-Gln-Tyr"
t = tyrocidineB1.split("-")
t = [protein_dictionary[p] for p in t]
tyroB1 = "".join(t)
t = [len(protein_table[p]) for p in t]
# Number of DNA strings of length 30 transcribe and translate into Tyrocidine B1:
reduce(lambda x, y: x * y, t)
6144
Find substrings of a genome encoding a given amino acid sequence.
Input: A DNA string Text and an amino acid string Peptide.
Output: All substrings of Text encoding Peptide (if any such substrings exist).
Note: The solution may contain repeated strings if the same string occurs more than once as a substring of Text and encodes Peptide.
def look_for_protein(gen, p):
output = []
g = DNAtoRNA(gen)
while g:
if detect_protein(g, p, 0):
output = output + [RNAtoDNA(g[:len(p)*3])]
g = g[1:]
return output
def detect_protein(g, p, count):
if len(g[:3]) != 3:
return False
#print g, count, p[count], codon_table[g[:3]]
if count + 1 == len(p) and codon_table[g[:3]] == p[count]:
return True
if count + 1 < len(p) and codon_table[g[:3]] == p[count]:
return detect_protein(g[3:],p, count + 1)
else:
return False
def look_around(g, p):
# Forward
l1 = look_for_protein(g, p)
# Backward // I'm guessing this could be done on the forward direction too...
l2 = look_for_protein(reversecomplement(g),p)
l2 = [reversecomplement(l) for l in l2]
#for l in l1+l2:
# print l,
return l1+l2
geno = "CACCTACTTATGTAGAAGTTCGTTCACGTAGCGAACGCGCTGTAGCTACATTGTTCCCGACTCGTCATCATGAATATTTTCATTTTATCTCCCGCGTCTGGTGTCCGTTCAGTCATCAATCCTCCTAATTGGTACCACGCTAAGCTGTGTGCCTCCTCGCTGGATAACCATACGCCGAGGTTATTTCCGACGGCTTCCGCTCGGATCCGGCTGTCGGCAACGCACTGTATTAGTGCAAACCCAGCGGAGGTCTTGTGCTAAGTCTATGAGCCCATGTACTATACGAAGTTTGGGCACTGTCAAGAAGATGCCGGGAACCGCTACTAAAGGAGTTATAGAAGGGAGCCGTGTGGTACCCCGCATGGCGCTTAGGCGAATTCAAATGGTCATGCGATCACAAATATCCCGCCCAATTACTGGGATCGCGTTACCTACTATGAAATCGCCTGCCTCAGTTCACTTAGCTAAGACAGTTCTATTAGGAGTTGACAGAGGCTAGTAGCAGTAGACGTAATTTGAGGTGGGGAGGCGACCCCCGGTTATCCGTTCCCGAGCCTTCCATCCTGTCACCAACCCTTCCCGGCAGTTAAAGAGCTCTAGTCCGACATTGTATGAGGACAATCTTTGAACTGTCACTCGACGTCTAGTAGCTAGGGAAAGGATTGCGTGTAGCTTATGTGTTCCAAGTCCATGAAATGATAAAGCGTCACCTCAATCAGAAAGGCCTCAGAACGACAGTGCGTAGTGCTAGCCCCTTGATCAGTTTATAAACTCAAACACTTGGCCCAGACGTGCAGAGGTTCATTGAGCCTAGAGTTCTCATGCTAGGAAGACAGCTAGTAAAATCGGCGTGGTTGTTGGAACACACGAAGCACTGCCCTCGAGCCAACCTAGGAGATCTCTATCCAGACGAGAGTAATCTTACCGCGGGTCGTTGTGTCATATATACGGAAACGCAAACATCAAGGTCACCATACAAAAATGAAGATATTCATGATGACCTCGAGACTTCGCGTTGAATGGCTTGAACAGACTAGACACTATCATGGCACCTCCATCACCGTCAAGCTTAAAGGTATTCCTCAACTCGTGGTGCGAAAGTTGATGGTGTCAGCCGACCCAGGATACAGCGCGGAGCCGAAAGTCTGTTACGCGCTACCAGGGCTTTGAACGGCTTGTCATCATAAAGATTTTCATCTTTCTGGAAGTCATCATGAATATTTTCATCTTGTTCTTTCCACAGTGTAACCACTGTCAAACAAAATGAAAATTTTCATGATGACAAGTAGGAGCCGAGGTCACTGGACCCTGATAGCGTATCGATCTACTGGGGGGGCTGTTCCACTCCGGCGGGGTGGTCTGCAGCAAATTGTATGTTCGAAACTCGCGTTCTGACTAGAACAGATTGGGTGGCTAACATAGATCGTGTTAGCGTGTGAGGGTCAGGAGAGGCCGGGTCTTCTGCATTAAAAAGAGTGTGCGGCGATGGTTGCCGTAGGTCTGCCTAGATATCTATATGGAGTCGAACGTCTCACCCGATCGCACCCATGCATATAAAACTTAAAAGTGGCTCATGCAATATGAGGGCTCGGCCCCCATTTTGCTTCACCTTAGTCTCGGAGACGCCCATAGACGCTTGAGTCAGCGCGCAGTGGCAGTAGCGACCGATCGAAGGCCGCTCTCAGTTTACAAGTTTCACTGTCTTATTAGCCCATGGTAGACGGTTATCTCGGACGGCGAAGCATCTTAACGTTTCTGCCTATCGAGTTATACGTCAGGCCAGGTGGGTAGGTCTCGAAGCCTTGCGCCCGGTCACAATCGCTCCCCTTCTATACACAAGATGAAAATTTTCATGATGACGAGTCGGACGTCTATCTGGATCGCGACCCATTACCGCCTGGTTCCTACTGGTCATCATAAATATCTTCATTTTTCAACCGATCCACCGTACTTAATTGTAAAAAAAGGAAAAGTAGTTGGCTCGCCCCTACGTCGTAAGATGAAGATTTTCATGATGACTTCCAGGTAAAGTTCCTAAACAGGGGGACGCATCACGAAGCCTTTACGTCGCGAGAGGAATATTCACTTGTGGCGACAAGGCGCGTGATGAGGCGCGTTGAGTCACATCGTGGGTGCTGAACCTCCACACACGACGTAGGAATGCGATTCCGCGCAGCTTGGGGAGTAGTGGTCTGAACGTCATTCTAGGGAAGGTTGATGGAACCCACGGGGTATCACGTTTGGATTCTAAGTAAGTTGGCGAATACCCGTAATGGCAGCCAATGGGTGGCGCTAAGATGAAGATCTTTATGATGACCTCCCGCAAAGATATTCGCGCGAATATAGGGGCACCCCTCCGCACCTTCCAAACCCTTTGGGCCCGAAAGTTTAGCCGAGGTCACTTCAGGGAAGTATTGCTGACCAGCAATGGTAACTAAACTAGCCATAATCGGTTGGGCTTCGAGGTGTCAACGCGTAAAAGTTATACCTCTCCGGTCCGAGTGTCACAATCTGCCCCGTTGAACGACAGCTAGGGTTCCTTCATACTTGATTGCTGCAGTACTATACCTTGGAAGCGGGTCATATGACCAATTCCCATGCAGACGCTGAGTCCGCAGCTTGGCCTCGGATGCACCAGACGGTCCGGGTCTCACCCTTTTATTTCCCCGTGGTACCCTCGGGAGTCTAAGGTATGGTCTTTGTTGGCAAACAGTCGACCGATACGCGGCCCCTATAACGCACCACGCGTGAACTTGGACTCTCCCGTAGGACGCCGGGACCTACATCAGTCTGAATCTAAACTTTCACGTGCCGTTAAGCCTCTGCGCTACGGTCAATGTAGGGCCTAAGCCCGGAAAACGCATTACTGACAGGGAGTGGTGTACGAAATTGCGTCTACAAGGTGCCACCGTACTCGGGATTCCCTACGTCGCGCCCGCATTTGACGATTTTTTTGCCCTGATGCCACGCGGGTCTCTAAGGAGGCAGAGACGAGAAATAAATCGGACCCTTATGGGCGCTATACTGCTTGAAATAGTTACTGTACTGCGTCGCCTGTATCGCACGTAGCGTAAGGGACCATCCGAATACACGTACGTTGGCCCAATATCAGTGCCTCTGCTTGCGCCAGATCCGCAGTATGGTGAAAGGATACGAACAGAATACTTTGTCTATGTCGGCCACGATATACGCTGCTCGTTAGCTTTGATCCGCGTGTTTATCTTCGCTGCGATGTCATGAATGACATCAAGTAAGTAACGATAACCCTGGAAAGGAAATTCGAGGTGTTCACCTCATGTGTTAAGAGATCCTGGCCTTCGCGATGTCATCATAAATATTTTCATCTTCAGTGAGTCCCTCGGCGTCTCACTTCCAGCTTTTTGACAGGCCAATAAATTAGATTCATCGTACAAATAGCCGGCCCTCCACGAAGGAACCAGGTAGAGTCCTTGAATTGGCACTACGGGCCGGGGCCGACCTAAGTGGCAAAGCTTGAGGTTAACTTTATAAGGCGCAACGTCTTCCAATTGACCTCTACACGTTAGTCGAAACCAACAATATTAGACTTTGTCAAGTGCACGACGCAGTTCGCGACGAGCTAGGTATACTCAGTTACTACAGAATGCTGGTCAGAACCGATATTACCCGACTTAAAATTAGATAGCACGTGTCTGCTCCCGTGCTCGAGCCCGTGCACCCTGTACGAAGTTATAGATTTCGTATGATTTAGAGGTGGTCCCACCATGTTGGTGAAAATGAAGATCTTCATGATGACTAGCCGAACTGGCGGGGAGCCCGAGATGTCATCATAAATATTTTCATTTTACGTTCTAGTCTAACAGTATGGGGGGCAGACGAGGGCTATTACGCACGTCTGTCTGAGGATCGCCGCCTTGCGCCGACCACCTGCCTAGAAGTCGAAGCGTAGCCAGCCGGCCAATGCGCCTCTATAGCCTGATTTCATTTCATCCAGAGATTCGAGTGCCCTAGGAGTAGAACCAAAACACACGCGCTTCGGGACTACGTGAGGGAATAATGTTAGGGGTGCTTGGTCTTGATTGTGTTAGGCTGTTAGCCAATTTTCCATTGTTAGTTTGGTTTTGTGCAGTGGTGGGGAAAGATCATGATTATCTACTCCTGTTCGTCGGAGAGATAACTGAATAACCGACACAAGCTCAAATTCCGGCTGAGTAGCTACTGACAACTCTTGACGGCGTCAGATGGAGCTAGGTCGGTTTTATTGGGATGGCGTTAATCCACCGTTGAGCGTACAGTCTCGTTGCCCCGTTTTGCCCACAGATCGTCTCTCGTCCCTGAAATCACTACTTACTATGAGATGGATCATTACCTCTGTACCCTTCAGTATTAACGGTGACAAATCCCTGAAAAGGTGAAATAGGCCACATCCCCGCGTAAGGATGAGTCCATACCTCGACTTGGATTATTCCCCCGAAGCAAGTCGAGTGTCGCAAACGGCGCGGCGAGCGCAGCCAATAGTCGCCGGTACGACGCTGAGTGGACCTGCTTACTATTCCATTCTTACTTCCAGCCCTAACGCGATGTTCTTAACCGACAGTCCATCGGAAGTGCAATTGGGGTGATTCTATTACTATAAGGATTTTCCCGTAGACTCGCACATCGATCGTCTTACCAAAAAAAGACGCTTTGGCCCTCGAAGTCATCATAAAGATTTTCATCTTCTGTCTGAGCCTAGCAAGATATAGGAGCTCTGGACGGTCGAGGTAGTATGTGATGCGTTATGCGCCACTGTCCTCTAAACTTCTAATACCGACTACTGAGCTCATAGAATGACGACGCGGCATTGCATTGGCAACAAGTCTCTCGGCGCTGTTTATAAGAAAGCGGACGACAGGCCCATAAACTGACCATTGGCGCCAACCATACGACTAGATTTTCGCCGACCAAACAAGGCCCCCGGCTAGTCATCATAAAAATCTTCATCTTCGACCGTTAAGGTTGGCGTGCCAAGATGAAGATCTTCATGATGACATCAAGAACCTGCAAGAGGTTGAAGTAGTATTATGTGACAGTAGCCTCTTCTTGGTCAAGGAAGCCCTTCTTGTCCCACTTAGATGCACTCCAGCTATGGGACATGCGACTTGGAACAATGAACCCGGTACACCGGAATTGACTTCCTATAGTATCGAGAATCTGGCATCAACAACCCAGACATTGCGCCCAATTGAATCTACCTGCTTTGCTTCGGAGGGGACACCATCTGGCTTCCTGAGACTAACGACGCTTTTTTGCGTAAACGGAGACCTCTGTAGGTATATGTACAAAAAAGCCGTAACTCCCGCGTTACTCTGTATGCTGCCAGGTAGGTCATCCATAATTGTAGCCCTACAGTCATTAAGCGATCGGAGAAATGAGTTGTCGCATGCTCTCGCGGGAAGTTGCGCGGCAATCAATACTCTGAACATTCTCCCCTGGTCCATTTGCTGTCGATCGCTATCAATCCATAGCATATGGCTTCGCGGGCTCAAACAATCAGGGCTGGTGATGTTTCCGACACTGTCGCTGTACTAAACTTGGATCATCCTCCCGTGACGTCCTTAGCTGTACGGTTGACCAAAGAAAAGTTAAATCGGCTGTTTTGTGCTTAAGTCTGCCACCCTGCTAGTCATCATGAAGATTTTCATCTTGCGTGCTAAGTCGCTCTCAACGGGAATCGAGGAAAAGTGTAGTCTACTCCAGCAGTTTCTCATCTGGGACGCCCGAGCAATCCGTTCGAGGGGTGGGTTGCAAGTCCAGACCTAATAGGTTATCAACGCGATGTTATCATAGTCAGATTAACAGAACCGTACGTGGAGACCAACGTCCGCGGCCCTCCGCTAACTTTATGCACCTATACACTTAGAGGCCCGATTCAACCATCCAATCCGTGTGGTCAGACGAGAAGGAAGATGAAAATCTTCATGATGACTAGTCGGTCGGCACAGGCAACAAGTCTACGTCTCATTTCTCCTCCGCAGCACCATACCCGTGCTCTTCCCACAGGAAAACAAGCTTAGAATCTCTAGACGTCGAGTAGCGCTATTCGGGTCGCGAGGTCATCATAAATATCTTCATTTTTTAATGCCCGCGCTCGGAATTGGCCCATCACATGTGACGACGCTATGCCGCCGCAACTCGTTAATAAAGTTCTACTTTTGAAGCCTTGTCGGCCTCGAAGGTAAATACCAGTTCATAAGAGTGCTCGTGTCAAGATGAAGATCTTCATGATGACCTCACGTCTTAATAGCCGTTTCGTCGATCACGGCAACGACTCTTGCGCCAGGCCTAAGTGTAAAGACGTCCCATCAAGCCTCCTTTGATAGTATCTAAACATGAATTAATGGGGCGACGCCATCGGATATTATGCGGGCATCCTTACTGGCTTATCGGGGGGCTGTTGCGTGCCATCCACAGCCTTTCGCGATGGTTTTTCAAACCTATCTTACTTTTTAGTTATTATCAGATCGTCACTTATTCCGGGTATTACGTCACCGGCAGCAACGCTAGGACCGCCAACATCTGGTATATGAAGTCACACATGAACTAAACTACCGGAGGGCAAGCGATTAAGGGTGGGCGGTACTGCCCCAGAGACTTATCTTCGTGACCCTACTAATCATATCCAGCTCCCCGGTAAACTACCACGATGGGACGCGGGTCACTGCCATTTCTTAAGAATGTAATGTGGCATCCGTCTCCATTCCTTCGGAATCCGTGAAGTAGAGTCCGCAGTACTGAGGAGGGATGTATCATTGAACCGAATGGAGAGACGATGGCGTGACGTCATCATAAAGATTTTCATCTTAACCCGCAGAGGCGGAGACCCAGTTCATTAGGACTTCACCGTGCATGGAGTTACGATTCCTCGAATTAACCAGAAGGTGTTCTTTTTCAGTGTATCTGTGTCTAGAGCTCCTAGATTTCGTGTAGGCCTGACATCTCTTTTGAATGATAGGCGAAGGTAACACTTTCCCGCATTGAGTCTTTGACACAGTCCGTCGTAGATCCGCCGGGGGCCGCAACGAGTACGCAGTTGAGGTGCGTGCATCATTCGACGAAAGTTCCGGCAAAATGAACCGAGTCCTCGGCCGCAGGCCAGCAAGTGTTCAAGCGTGTAGACGTTTCGAGCTTATCGCCCGTTAGAGCTCGTCTCCGAGTTACATAGCTTACCTGTCTGTGACAGCCAAGTGGGGGAAGTAAGTTGGACGTGAGTCCGAAACGTCGTGTCGCCACCTCGTAAGAACACAGGGTGAAGTTATTAGCCAAAAAAAGCGGCCCTAGCCTCAGATAATAAGTCATGACCCTGGATCGGTGCCCATGATATACTACTTGTAGGATCACGAGTTTGATATCATACGCCTTTGTCTTCTGATCAGTACGCTCACCTATTCGTGACACCGTATTTAGACTCGAAGTTATTCATCAGCGTCTGCTCATGTGAAATGAAACCCAATGGGTCGCGCAAGAGGTAGTGCGGACCTCGGTACAGTAGGAATGCATTCCCACGTTCCCTTTCGCAAACCCGCGTTCATCACATGACATTCCCAACAAGCATGGTATGATCCATTTGTAACACCGACCACCTGGATGGGACCCCCCTCTTGGATTTGAGTCTAGCGTCCACCCTGCCTCGAGGCGATAGTTGGTCTTCGCTCTTAATGCACTTAAGTAGGTCAAGGTAAAGTACGTAGTATAAATTGCTCCGTCCATCAATTAGCTGCCGATTGCTCTGTTCTTCTGAAACAGTTCGCTATGTAACCGATTGGTCAAACTAGAGATGGGCTGGAGCGGGGCCGCATTCTATGAGCTGCGTGGGGCCCATCGTCTGATGATCATCTGTCAGAGGGTAAGCCATCTAGAAGCCTCAGGTACAGCAAGAGACAGGGGTATAATTTGATACCGCTGGGTAAGTAAACTCTTCTGACATTGATCTATTCCTATAGCGGGGGTCCATCGACGGGACTGTTGCCCTGACCTTGATAGACACTGGCTGAGTGCCCAGGCCAGTATAGGGGTATTCGGTATGGGTCATTATTTTTTACAGCGAATAAAGTACAAACCATCGCGATGCGAGTTCATGTCTGTACTCTCGCAGAGGAGAGCGTCCGGGCTCCTCGGCGTAATCAGCTTACCTAAGGATTGATCAGCCATTCCCCCAGGAAATGTCAGAGTCACGCTTGTTTAGTTCTTCAGGAGTACGAATGAAAGTGCTCTGTGCCCAGATTTATGGGGTATCTCTGAGAGGTACAGGGACCGCAATTGCGTAGCGGAGGAGGATTCGATGGGAACTGTGTTGGGGAACTCACCTAAGTCTCAAAATCGGTTTTGTTGTAGATTGTAGGTTTGAATAGAAGCTCGCGCCTGGCATCAGCTGTTACCAGGCCACGCAGCTTCACTCATGGTCCTGGATACACCCCGTTTTCAGCCGTCGTCAACAATTATCCAGCCAGGCCGTCTCTGCACGATGCGGCACCTGACAGCTCAGGGCCCATCTTTTGTTTACTGTAAAACCTAGGGTCTGAGTGCGACCTGCCGAAAGCGCCTCGTCTAGCCTAGTGAATACGGACAACGGAGCCCGCTGCAGTGGTTACCTTAACTCCTTGGATGAATCCGTGGTCGAGAGTGTAATTAATCTGGCGTGCTATTGACGACGAATTTCACTGTTCATATTGTCTCGCTTTGTTAGCGGGGCACTGCCAAAATGCCTAGTACACCAAGGTGGAGGGTCGTGTCGGAATTGATCAGCCACTAACGCTCCTTACGTTGAACATTGCAGC"
prot = "KMKIFMMTSR"
look_around(geno,prot)
AAAATGAAGATATTCATGATGACCTCGAGA AAAATGAAAATTTTCATGATGACAAGTAGG AAGATGAAAATTTTCATGATGACGAGTCGG AAGATGAAGATTTTCATGATGACTTCCAGG AAGATGAAGATCTTTATGATGACCTCCCGC AAAATGAAGATCTTCATGATGACTAGCCGA AAGATGAAGATCTTCATGATGACATCAAGA AAGATGAAAATCTTCATGATGACTAGTCGG AAGATGAAGATCTTCATGATGACCTCACGT GCGTGACGTCATCATAAAGATTTTCATCTT TCGCGAGGTCATCATAAATATCTTCATTTT CCTGCTAGTCATCATGAAGATTTTCATCTT CCGGCTAGTCATCATAAAAATCTTCATCTT CCTCGAAGTCATCATAAAGATTTTCATCTT CCGAGATGTCATCATAAATATTTTCATTTT TCGCGATGTCATCATAAATATTTTCATCTT CCTACTGGTCATCATAAATATCTTCATTTT TCTGGAAGTCATCATGAATATTTTCATCTT ACGGCTTGTCATCATAAAGATTTTCATCTT CCGACTCGTCATCATGAATATTTTCATTTT
['AAAATGAAGATATTCATGATGACCTCGAGA', 'AAAATGAAAATTTTCATGATGACAAGTAGG', 'AAGATGAAAATTTTCATGATGACGAGTCGG', 'AAGATGAAGATTTTCATGATGACTTCCAGG', 'AAGATGAAGATCTTTATGATGACCTCCCGC', 'AAAATGAAGATCTTCATGATGACTAGCCGA', 'AAGATGAAGATCTTCATGATGACATCAAGA', 'AAGATGAAAATCTTCATGATGACTAGTCGG', 'AAGATGAAGATCTTCATGATGACCTCACGT', 'GCGTGACGTCATCATAAAGATTTTCATCTT', 'TCGCGAGGTCATCATAAATATCTTCATTTT', 'CCTGCTAGTCATCATGAAGATTTTCATCTT', 'CCGGCTAGTCATCATAAAAATCTTCATCTT', 'CCTCGAAGTCATCATAAAGATTTTCATCTT', 'CCGAGATGTCATCATAAATATTTTCATTTT', 'TCGCGATGTCATCATAAATATTTTCATCTT', 'CCTACTGGTCATCATAAATATCTTCATTTT', 'TCTGGAAGTCATCATGAATATTTTCATCTT', 'ACGGCTTGTCATCATAAAGATTTTCATCTT', 'CCGACTCGTCATCATGAATATTTTCATTTT']
Generate the theoretical spectrum of a cyclic peptide.
Input: An amino acid string Peptide.
Output: Cyclospectrum(Peptide).
Note: Cyclospectrum(Peptide) is the collection of all of the masses of its subpeptides, in addition to the mass 0 and the mass of the entire peptide.
def subpeptides(pep):
p = list(pep)
n = len(p)
output = [pep]
p_extended = p + p[:n-2]
for i in range(n):
for j in range(n-1):
output += ["".join(p_extended[i:i+j+1])]
return output
#import urllib2
#url = "https://stepic.org/media/attachments/lessons/98/integer_mass_table.txt"
#table_file= urllib2.urlopen(url)
integer_mass_table = {}
file = "/Users/javier/Documents/Data/stepic/integer_mass_table.txt"
for line in open(file):
l = line.split()
integer_mass_table[l[0]] = int(l[1])
def mass_subpeptide(pep):
count = 0
for x in pep:
count += integer_mass_table[x]
return count
def cyclospectrum(pep):
output = [0]
subs = subpeptides(pep)
for s in subs:
output += [mass_subpeptide(s)]
return sort(output)
for w in cyclospectrum("RRNWLVERNEYIT"):
print w,
0 99 101 113 113 114 114 129 129 156 156 156 163 186 212 214 228 243 257 270 270 276 285 292 299 300 312 341 370 377 384 398 399 399 405 406 413 413 426 456 497 498 506 512 519 526 527 527 528 533 562 569 611 612 620 627 640 641 662 668 675 683 689 691 713 725 740 776 776 790 797 797 797 803 804 818 824 826 826 903 903 905 911 925 926 932 932 932 939 953 953 989 1004 1016 1038 1040 1046 1054 1061 1067 1088 1089 1102 1109 1117 1118 1160 1167 1196 1201 1202 1202 1203 1210 1217 1223 1231 1232 1273 1303 1316 1316 1323 1324 1330 1330 1331 1345 1352 1359 1388 1417 1429 1430 1437 1444 1453 1459 1459 1472 1486 1501 1515 1517 1543 1566 1573 1573 1573 1600 1600 1615 1615 1616 1616 1628 1630 1729
Compute the number of peptides of given total mass.
Input: An integer m.
Output: The number of linear peptides having integer mass m.
aminoacid_masses = sort(list(set(integer_mass_table.values())))
def peptides(n, d):
for m in aminoacid_masses:
if n-m in d:
d[n] = d.get(n,0)+d[n-m]
return d
def pep_counter(M):
dicc = {0:1}
mn = min(aminoacid_masses)
for i in range(M-mn+1):
j = i+mn
peptides(j,dicc)
return dicc
dicc[1351]
115361010962811
pep_counter(1600)[1600]
109000330554114810
l= []
d = pep_counter(3000)
for i in range(3001):
if i in d:
l += [d[i]]
else:
l += [0]
import matplotlib.pyplot as ppl
fig = ppl.figure(figsize=(15,10), dpi=100)
ax = fig.add_subplot(1,1,1)
line, = ax.plot(l, color='red', lw=1)
ax.set_yscale('log')
show()
This figure suggests that for large $m$, the number of peptides with given integer mass $m$ can be approximated as $k · C^m$, where $k$ and $C$ are constants. Find (estimate) $C\pm 0.002$.
from __future__ import division
d = pep_counter(3000)
Observe that if $y(n) = k C^n$ then $k = \frac{y(n)}{C^n}$. Thus, for $m > n$, $C^{m-n} = \frac{y(m)}{y(n)}$.
Let's take $m = n+1$ for $n$ relatively large and calculate $\frac{y(m)}{y(n)}$:
for i in range(100):
print d[3000-i]/d[3000-(i+1)],
1.02777962601 1.02779331201 1.02779428496 1.02778231924 1.02775955862 1.02773032568 1.02770013234 1.02767472373 1.02765903188 1.02765601258 1.02766645462 1.02768829251 1.02771759738 1.02774867844 1.0277757404 1.02779351957 1.02779858992 1.02778992341 1.02776902019 1.02773987932 1.02770791302 1.02767928759 1.02765943249 1.02765220082 1.02765908721 1.02767878304 1.0277077306 1.02774033268 1.02777053075 1.02779243832 1.02780189415 1.02779699297 1.0277785381 1.02775005315 1.02771677939 1.02768518924 1.02766116829 1.02764947875 1.02765233288 1.02766931969 1.02769729755 1.02773094467 1.02776397124 1.02778997316 1.0278040373 1.02780334219 1.02778793293 1.02776070459 1.02772666455 1.02769243284 1.02766433638 1.0276479945 1.02764638532 1.0276600869 1.02768644992 1.02772060025 1.02775606151 1.02778604682 1.02780486561 1.02780878432 1.02779700796 1.02777167171 1.02773747264 1.02770099891 1.02766901421 1.02764788546 1.02764144249 1.0276512788 1.02767536519 1.027709408 1.02774683063 1.02778060095 1.02780423749 1.0278131335 1.02780555232 1.02778277459 1.02774907702 1.02771084436 1.02767525373 1.02764927738 1.02763770042 1.02764309867 1.02766424412 1.02769749992 1.02773633972 1.02777359773 1.02780202929 1.02781620633 1.02781334576 1.02779381565 1.0277613205 1.0277219015 1.02768307655 1.02765228316 1.02763534635 1.02763575891 1.02765330615 1.02768503318 1.02772468255 1.02776502261
peptido = [186,128,113]
CYCLOPEPTIDESEQUENCING(Spectrum)
List ← {0-peptide}
while List is nonempty
List ← Expand(List)
for each peptide Peptide in List
if Cyclospectrum(Peptide) = Spectrum
output Peptide
remove Peptide from List
else if Peptide is not consistent with Spectrum
remove Peptide from List
Note 1: Expand(List) is a new collection containing all possible extensions of peptides in List by a single amino acid mass.
Note 2: A linear peptide is consistent with Spectrum if every mass in its theoretical spectrum is contained in Spectrum.
def expand_peptides(p):
output = []
for m in aminoacid_masses:
output += [p +[m]]
return output
def subpeptides_n(pep):
p = list(pep)
n = len(p)
output = [pep]
p_extended = p + p[:n-2]
for i in range(n):
for j in range(n-1):
output += [p_extended[i:i+j+1]]
return output
def lineal_subpeptides(pep):
output = []
n = len(pep)
for i in range(n):
for j in range(n-i):
output += [pep[i:i+j+1]]
return output
def spectrum(function_spectrum, pep):
subs = function_spectrum(pep)
return list(sort([sum(x) for x in subs] + [0]))
def compatible_specs(can, spec):
l = list(spec)
s = spectrum(lineal_subpeptides, can)
for x in s:
if x in l:
l.remove(x)
else:
return False
return True
def expand_list_pep(l):
output = []
for x in l:
output += expand_peptides(x)
return output
def secuencia(espec):
candidatos = [[]]
salida = []
while candidatos:
candidatos = expand_list_pep(candidatos)
salida += [c for c in candidatos if spectrum(subpeptides_n, c) == espec]
candidatos = [c for c in candidatos if compatible_specs(c,espec)]
return salida
espectro = [0, 113, 128, 186, 241, 299, 314, 427]
secuencia(espectro)
[[113, 128, 186], [113, 186, 128], [128, 113, 186], [128, 186, 113], [186, 113, 128], [186, 128, 113]]
espe = "0 71 97 99 113 114 128 129 131 147 163 168 202 213 227 242 244 246 257 291 294 299 315 326 343 356 360 365 370 414 420 422 446 455 457 462 473 484 493 528 533 545 551 570 583 590 602 609 622 641 647 659 664 699 708 719 730 735 737 746 770 772 778 822 827 832 836 849 866 877 893 898 901 935 946 948 950 965 979 990 1024 1029 1045 1061 1063 1064 1078 1079 1093 1095 1121 1192".split()
espe = map(int,espe)
resultados = secuencia(espe)
for r in resultados:
print "-".join(map(str,r))
71-97-147-99-114-113-129-128-163-131 71-131-163-128-129-113-114-99-147-97 97-71-131-163-128-129-113-114-99-147 97-147-99-114-113-129-128-163-131-71 99-114-113-129-128-163-131-71-97-147 99-147-97-71-131-163-128-129-113-114 113-114-99-147-97-71-131-163-128-129 113-129-128-163-131-71-97-147-99-114 114-99-147-97-71-131-163-128-129-113 114-113-129-128-163-131-71-97-147-99 128-129-113-114-99-147-97-71-131-163 128-163-131-71-97-147-99-114-113-129 129-113-114-99-147-97-71-131-163-128 129-128-163-131-71-97-147-99-114-113 131-71-97-147-99-114-113-129-128-163 131-163-128-129-113-114-99-147-97-71 147-97-71-131-163-128-129-113-114-99 147-99-114-113-129-128-163-131-71-97 163-128-129-113-114-99-147-97-71-131 163-131-71-97-147-99-114-113-129-128
Input: Integer N and a collection of integers Spectrum.
Output: LeaderPeptide after running LEADERBOARDCYCLOPEPTIDESEQUENCING(Spectrum, N).
Pseudocode:
LEADERBOARDCYCLOPEPTIDESEQUENCING(Spectrum, N)
Leaderboard ← {0-peptide}
LeaderPeptide ← 0-peptide
while Leaderboard is non-empty
Leaderboard ← Expand(Leaderboard)
for each Peptide in Leaderboard
if Mass(Peptide) = ParentMass(Spectrum)
if Score(Peptide, Spectrum) > Score(LeaderPeptide, Spectrum)
LeaderPeptide ← Peptide
else if Mass(Peptide) > ParentMass(Spectrum)
remove Peptide from Leaderboard
Leaderboard ← Cut(Leaderboard, Spectrum, N)
output LeaderPeptide
Note 1: ParentMass(Spectrum) = max(Spectrum)
Note 2: Score(Peptide, Spectrum) is the number of masses shared between Cyclospectrum(Peptide) and Spectrum.
Note 3: Cut(Leaderboard, Spectrum, N) returns the top N highest-scoring peptides in Leaderboard (including ties) with respect to Spectrum.
def mass(pep):
return sum(pep)
def parentmass(spec):
return max(spec)
def score(pep, spec):
s = spectrum(subpeptides_n, pep)
sp = list(spec)
output = 0
for x in s:
if x in sp:
output += 1
sp.remove(x)
return output
def cut_leads(leaderboard, N, spec):
leaders = sorted([(p,score(p,spec)) for p in leaderboard], key=lambda k : k[1], reverse=True)
return [p for p,s in leaders if len(leaders) <= N or s >= leaders[N][1]]
def leaderboardsequencing(spec, N):
leaderboard = [[]]
leaderpeptide = []
leaderscore = 0
pm = parentmass(spec)
while leaderboard:
print 'Leader length: %s' % len(leaderboard)
leaderboard = expand_list_pep(leaderboard)
print 'Leader expanded: %s' % len(leaderboard)
leaderboard = cut_leads(leaderboard, N, spec)
print 'After cut: %s' % len(leaderboard)
top = score(leaderboard[0], spec)
if top > leaderscore:
leaderpeptide = leaderboard[0]
leaderscore = top
lastleaderboard = list(leaderboard)
st = 'Top: %s \nScore: %s' % ('-'.join(map(str, leaderpeptide)), leaderscore)
m = len(st)
print st
print "="*m
leaderboard = [p for p in leaderboard if mass(p) <= pm]
return leaderpeptide, lastleaderboard
spl = "0 71 113 129 147 200 218 260 313 331 347 389 460".split()
spec = map(int, spl)
x = leaderboardsequencing(spec, 10)
print "-".join(map(str, x))
Leader length: 1 Leader expanded: 18 After cut: 18 Top: 71 Score: 2 ================= Leader length: 18 Leader expanded: 324 After cut: 16 Top: 71-129 Score: 4 ===================== Leader length: 16 Leader expanded: 288 After cut: 12 Top: 71-129-147 Score: 7 ========================= Leader length: 12 Leader expanded: 216 After cut: 12 Top: 71-147-113-129 Score: 13 ============================== Leader length: 12 Leader expanded: 216 After cut: 14 Top: 71-147-113-129 Score: 13 ============================== 71-147-113-129
spec = "0 71 97 101 103 113 113 113 113 114 114 115 128 128 128 128 129 131 131 131 156 156 184 186 186 200 214 227 227 228 230 231 241 242 242 243 244 244 256 257 262 269 270 287 298 299 301 328 331 340 340 343 345 345 356 358 359 370 370 372 375 383 385 397 400 401 429 430 442 453 454 454 459 462 468 471 472 473 474 485 486 487 498 499 501 512 514 514 542 561 567 570 573 575 581 583 585 590 599 600 600 601 602 610 615 615 616 627 627 630 658 695 696 698 698 698 701 703 704 713 723 728 728 728 728 730 730 731 741 744 747 758 761 769 799 810 817 827 829 831 832 841 841 844 844 851 854 854 857 859 862 872 882 884 886 889 928 928 944 945 947 955 955 958 959 960 966 967 972 972 982 985 990 996 997 1000 1000 1003 1041 1056 1059 1062 1068 1068 1068 1073 1075 1075 1084 1087 1089 1095 1097 1103 1113 1114 1128 1128 1131 1152 1172 1172 1181 1182 1184 1189 1190 1190 1196 1197 1199 1200 1202 1210 1212 1227 1231 1242 1259 1259 1283 1295 1298 1303 1303 1303 1303 1304 1311 1312 1317 1318 1325 1325 1328 1330 1338 1340 1345 1355 1356 1388 1396 1416 1426 1426 1427 1431 1432 1432 1434 1440 1442 1443 1445 1451 1453 1453 1454 1458 1459 1459 1469 1489 1497 1529 1530 1540 1545 1547 1555 1557 1560 1560 1567 1568 1573 1574 1581 1582 1582 1582 1582 1587 1590 1602 1626 1626 1643 1654 1658 1673 1675 1683 1685 1686 1688 1689 1695 1695 1695 1696 1701 1703 1704 1713 1713 1733 1754 1757 1757 1771 1772 1782 1788 1790 1796 1798 1801 1810 1810 1812 1817 1817 1817 1823 1826 1829 1844 1882 1885 1885 1888 1889 1895 1900 1903 1913 1913 1918 1919 1925 1926 1927 1930 1930 1938 1940 1941 1957 1957 1996 1999 2001 2003 2013 2023 2026 2028 2031 2031 2034 2041 2041 2044 2044 2053 2054 2056 2058 2068 2075 2086 2116 2124 2127 2138 2141 2144 2154 2155 2155 2157 2157 2157 2157 2162 2172 2181 2182 2184 2187 2187 2187 2189 2190 2227 2255 2258 2258 2269 2270 2270 2275 2283 2284 2285 2285 2286 2295 2300 2302 2304 2310 2312 2315 2318 2324 2343 2371 2371 2373 2384 2386 2387 2398 2399 2400 2411 2412 2413 2414 2417 2423 2426 2431 2431 2432 2443 2455 2456 2484 2485 2488 2500 2502 2510 2513 2515 2515 2526 2527 2529 2540 2540 2542 2545 2545 2554 2557 2584 2586 2587 2598 2615 2616 2623 2628 2629 2641 2641 2642 2643 2643 2644 2654 2655 2657 2658 2658 2671 2685 2699 2699 2701 2729 2729 2754 2754 2754 2756 2757 2757 2757 2757 2770 2771 2771 2772 2772 2772 2772 2782 2784 2788 2814 2885".split()
spec = map(int, spec)
# This is REALLY slow...
x = leaderboardsequencing(spec, 26)
print "-".join(map(str, x))
Leader length: 1 Leader expanded: 18 After cut: 18 Top: 71 Score: 2 ================ Leader length: 18 Leader expanded: 324 After cut: 58 Top: 71-113 Score: 4 ==================== Leader length: 58 Leader expanded: 1044 After cut: 87 Top: 71-113-115 Score: 8 ======================== Leader length: 87 Leader expanded: 1566 After cut: 65 Top: 113-114-156-131 Score: 14 ============================== Leader length: 65 Leader expanded: 1170 After cut: 32 Top: 113-114-156-131-71 Score: 20 ================================= Leader length: 32 Leader expanded: 576 After cut: 35 Top: 156-71-113-114-131-113 Score: 27 ===================================== Leader length: 35 Leader expanded: 630 After cut: 44 Top: 156-131-113-114-113-71-156 Score: 33 ========================================= Leader length: 44 Leader expanded: 792 After cut: 50 Top: 113-128-129-101-156-71-156-131 Score: 44 ============================================= Leader length: 50 Leader expanded: 900 After cut: 28 Top: 114-113-156-131-71-129-128-113-131 Score: 53 ================================================= Leader length: 28 Leader expanded: 504 After cut: 29 Top: 114-113-156-131-71-129-128-113-131-113 Score: 62 ===================================================== Leader length: 29 Leader expanded: 522 After cut: 31 Top: 156-71-113-114-131-156-113-101-129-131-113 Score: 75 ========================================================= Leader length: 31 Leader expanded: 558 After cut: 34 Top: 156-71-113-114-131-113-156-101-129-101-128-113 Score: 92 ============================================================= Leader length: 34 Leader expanded: 612 After cut: 30 Top: 156-71-113-114-131-156-113-101-129-128-128-114-128 Score: 103 ================================================================== Leader length: 30 Leader expanded: 540 After cut: 30 Top: 156-71-113-114-131-156-113-101-129-128-128-114-128-113 Score: 116 ====================================================================== Leader length: 30 Leader expanded: 540 After cut: 28 Top: 156-71-113-114-131-156-113-101-129-128-128-114-103-131-129 Score: 123 ========================================================================== Leader length: 28 Leader expanded: 504 After cut: 27 Top: 156-71-113-114-131-156-113-101-129-128-128-114-103-128-97-131 Score: 144 ============================================================================= Leader length: 27 Leader expanded: 486 After cut: 28 Top: 156-71-113-114-131-156-113-101-129-128-128-114-128-103-97-131-128 Score: 164 ================================================================================= Leader length: 28 Leader expanded: 504 After cut: 27 Top: 156-71-113-114-131-156-113-101-129-128-128-114-128-103-97-131-131-113 Score: 202 ===================================================================================== Leader length: 27 Leader expanded: 486 After cut: 30 Top: 156-71-113-114-131-156-113-101-129-128-128-114-128-103-97-131-131-128-113 Score: 212 ========================================================================================= Leader length: 30 Leader expanded: 540 After cut: 29 Top: 156-71-113-114-131-156-113-101-129-128-128-114-128-103-97-131-131-113-128-113 Score: 224 ============================================================================================= Leader length: 29 Leader expanded: 522 After cut: 27 Top: 156-71-113-114-131-156-113-101-129-128-128-114-128-103-97-131-131-113-129-128-113 Score: 253 ================================================================================================= Leader length: 27 Leader expanded: 486 After cut: 28 Top: 156-71-113-114-131-156-113-101-129-128-128-114-128-103-97-131-131-113-129-128-113 Score: 253 ================================================================================================= Leader length: 28 Leader expanded: 504 After cut: 29 Top: 156-71-113-114-131-156-113-101-129-128-128-114-128-103-97-131-131-113-131-113-128-115-113 Score: 301 ========================================================================================================= Leader length: 29 Leader expanded: 522 After cut: 27 Top: 156-71-113-114-131-156-113-101-129-128-128-114-128-103-97-131-131-113-131-113-128-115-128-113 Score: 435 ============================================================================================================= Leader length: 27 Leader expanded: 486 After cut: 30 Top: 156-71-113-114-131-156-113-101-129-128-128-114-128-103-97-131-131-113-131-113-128-115-128-113 Score: 435 ============================================================================================================= 156-71-113-114-131-156-113-101-129-128-128-114-128-103-97-131-131-113-131-113-128-115-128-113
spec ="0 97 99 113 114 115 128 128 147 147 163 186 227 241 242 244 244 256 260 261 262 283 291 309 330 333 340 347 385 388 389 390 390 405 435 447 485 487 503 504 518 544 552 575 577 584 599 608 631 632 650 651 653 672 690 691 717 738 745 770 779 804 818 819 827 835 837 875 892 892 917 932 932 933 934 965 982 989 1039 1060 1062 1078 1080 1081 1095 1136 1159 1175 1175 1194 1194 1208 1209 1223 1322".split()
spec = map(int, spec)
x, l = leaderboardsequencing(spec, 1000)
print "-".join(map(str, x))
Leader length: 1 Leader expanded: 18 After cut: 18 Top: 97 Score: 2 ================= Leader length: 18 Leader expanded: 324 After cut: 324 Top: 97-147 Score: 4 ===================== Leader length: 324 Leader expanded: 5832 After cut: 1732 Top: 99-128-163 Score: 8 ========================= Leader length: 1732 Leader expanded: 31176 After cut: 2001 Top: 99-163-128-128 Score: 12 ============================== Leader length: 2001 Leader expanded: 36018 After cut: 1748 Top: 128-99-163-128-113 Score: 16 ================================== Leader length: 1748 Leader expanded: 31464 After cut: 1250 Top: 99-163-128-114-147-186 Score: 23 ====================================== Leader length: 1250 Leader expanded: 22500 After cut: 1306 Top: 128-99-163-128-114-147-113 Score: 33 ========================================== Leader length: 1306 Leader expanded: 23508 After cut: 1124 Top: 114-128-163-99-128-113-147-186 Score: 38 ============================================== Leader length: 1124 Leader expanded: 20232 After cut: 1213 Top: 113-128-99-163-128-114-147-186-97 Score: 48 ================================================= Leader length: 1213 Leader expanded: 21834 After cut: 1029 Top: 113-128-99-163-128-114-147-186-97-147 Score: 82 ===================================================== Leader length: 975 Leader expanded: 17550 After cut: 1009 Top: 113-128-99-163-128-114-147-71-115-97-147 Score: 83 ======================================================== Leader length: 128 Leader expanded: 2304 After cut: 1004 Top: 113-128-99-163-128-114-147-71-115-97-147 Score: 83 ======================================================== Leader length: 2 Leader expanded: 36 After cut: 36 Top: 113-128-99-163-128-114-147-71-115-97-147 Score: 83 ======================================================== 113-128-99-163-128-114-147-71-115-97-147
So the problem with the previous solution (and what makes it painfully slow) is that it calculates the score of P+A (relative to Spec) for every single aminoacid mass A and peptide P without using the fact that we have already calculate the score of P. What follows is an attempt of fixing that.
def expandPeptide(L, p, m, sp_f):
sp = dict(L[p][0])
sc = L[p][2]
ma = L[p][3]
p_list = L[p][1]
ma += m
if ma <= max(sp_f):
if p == '':
sp[m] = sp.get(m,0) + 1
if sp[m] <= sp_f.get(m,0):
sc += 1
else:
sc = modifySpectrum(p_list, m, sp, sc, sp_f)
newkey = newPeptideKey(p,m)
L[newkey] = (sp, p_list + [m], sc, ma)
def modifySpectrum(p_list, m, sp, sc, sp_f):
output = sc
coming = [sum(s) for s in extension_newsubs(p_list,m)]
for m in coming:
sp[m] = sp.get(m,0) + 1
if sp[m] <= sp_f.get(m,0):
output += 1
#going = [sum(s) for s in extension_outdatedsubs(p_list)]
#for m in going:
# sp[m] -= 1
# if sp[m] < sp_f.get(m,0):
# output -= 1
return output
def specFreq(spec):
output = {}
for m in spec:
output[m] = output.get(m,0) + 1
return output
def newPeptideKey(p,a):
if p == '':
return str(a)
else:
return p + '-' + str(a)
def extension_newsubs(oldpep, a):
pivot = len(oldpep)
newpep = oldpep + [a]
newpep_extended = newpep + newpep[:pivot - 1]
output = [[a], newpep]
for i in range(pivot-1):
for j in range(i+2):
output += [newpep_extended[pivot-(i+1)+j:pivot+1+j]]
return output
def extension_outdatedsubs(oldpep):
pivot = len(oldpep) - 1
oldpep_extended = oldpep + oldpep[:pivot -1]
output = []
for i in range(pivot-1):
for j in range(i + 1):
output += [oldpep_extended[pivot-i+j:pivot+1+j+1]]
return output
def expandLeaderboard(L, sp_f):
current = list(L.keys())
for p in current:
preparePep(L, p, sp_f)
for m in aminoacid_masses:
expandPeptide(L, p, m, sp_f)
del L[p]
def preparePep(L, p, sp_f):
sp = dict(L[p][0])
p_list = L[p][1]
sc = L[p][2]
ma = L[p][3]
going = [sum(s) for s in extension_outdatedsubs(p_list)]
for m in going:
sp[m] -= 1
if sp[m] < sp_f.get(m,0):
sc -= 1
L[p] = (sp, p_list, sc, ma)
def newleaderboardsequencing(spec, N):
LeaderBoard = {}
LeaderBoard[''] = ({0:1}, [], 1, 0)
leaderpeptide = ''
leaderscore = 1
pm = parentmass(spec)
spec_frec = specFreq(spec)
while LeaderBoard:
print 'Leader length: %s' % len(LeaderBoard)
#print 'Leader expanded: %s' % len(LeaderBoard)
top_scores = sorted([LeaderBoard[p][2] for p in LeaderBoard], reverse=True)
if len(top_scores) > N:
LeaderBoard = {p:v for p,v in LeaderBoard.items() if v[2] >= top_scores[N]}
#print 'After cut: %s' % len(LeaderBoard)
top = top_scores[0]
if top > leaderscore:
top_scorers = [p for p in LeaderBoard if LeaderBoard[p][2] == top]
leaderpeptide = top_scorers[0]
leaderscore = top
st = 'Top: %s \nScore: %s' % (leaderpeptide, leaderscore)
m = len(st)
print st
print "="*m
#LeaderBoard = {p:v for p,v in LeaderBoard.items() if v[3] <= pm}
expandLeaderboard(LeaderBoard, spec_frec)
return leaderpeptide
spl = "0 71 113 129 147 200 218 260 313 331 347 389 460".split()
spec = map(int, spl)
spec_freq = specFreq(spec)
x = newleaderboardsequencing(spec, 10)
print x
Leader length: 1 After cut: 1 Top: Score: 1 =============== Leader length: 18 After cut: 18 Top: 147 Score: 2 ================== Leader length: 324 After cut: 16 Top: 71-129 Score: 4 ===================== Leader length: 286 After cut: 12 Top: 113-71-147 Score: 7 ========================= Leader length: 120 After cut: 12 Top: 147-71-129-113 Score: 13 ============================== 147-71-129-113
spec = "0 71 97 101 103 113 113 113 113 114 114 115 128 128 128 128 129 131 131 131 156 156 184 186 186 200 214 227 227 228 230 231 241 242 242 243 244 244 256 257 262 269 270 287 298 299 301 328 331 340 340 343 345 345 356 358 359 370 370 372 375 383 385 397 400 401 429 430 442 453 454 454 459 462 468 471 472 473 474 485 486 487 498 499 501 512 514 514 542 561 567 570 573 575 581 583 585 590 599 600 600 601 602 610 615 615 616 627 627 630 658 695 696 698 698 698 701 703 704 713 723 728 728 728 728 730 730 731 741 744 747 758 761 769 799 810 817 827 829 831 832 841 841 844 844 851 854 854 857 859 862 872 882 884 886 889 928 928 944 945 947 955 955 958 959 960 966 967 972 972 982 985 990 996 997 1000 1000 1003 1041 1056 1059 1062 1068 1068 1068 1073 1075 1075 1084 1087 1089 1095 1097 1103 1113 1114 1128 1128 1131 1152 1172 1172 1181 1182 1184 1189 1190 1190 1196 1197 1199 1200 1202 1210 1212 1227 1231 1242 1259 1259 1283 1295 1298 1303 1303 1303 1303 1304 1311 1312 1317 1318 1325 1325 1328 1330 1338 1340 1345 1355 1356 1388 1396 1416 1426 1426 1427 1431 1432 1432 1434 1440 1442 1443 1445 1451 1453 1453 1454 1458 1459 1459 1469 1489 1497 1529 1530 1540 1545 1547 1555 1557 1560 1560 1567 1568 1573 1574 1581 1582 1582 1582 1582 1587 1590 1602 1626 1626 1643 1654 1658 1673 1675 1683 1685 1686 1688 1689 1695 1695 1695 1696 1701 1703 1704 1713 1713 1733 1754 1757 1757 1771 1772 1782 1788 1790 1796 1798 1801 1810 1810 1812 1817 1817 1817 1823 1826 1829 1844 1882 1885 1885 1888 1889 1895 1900 1903 1913 1913 1918 1919 1925 1926 1927 1930 1930 1938 1940 1941 1957 1957 1996 1999 2001 2003 2013 2023 2026 2028 2031 2031 2034 2041 2041 2044 2044 2053 2054 2056 2058 2068 2075 2086 2116 2124 2127 2138 2141 2144 2154 2155 2155 2157 2157 2157 2157 2162 2172 2181 2182 2184 2187 2187 2187 2189 2190 2227 2255 2258 2258 2269 2270 2270 2275 2283 2284 2285 2285 2286 2295 2300 2302 2304 2310 2312 2315 2318 2324 2343 2371 2371 2373 2384 2386 2387 2398 2399 2400 2411 2412 2413 2414 2417 2423 2426 2431 2431 2432 2443 2455 2456 2484 2485 2488 2500 2502 2510 2513 2515 2515 2526 2527 2529 2540 2540 2542 2545 2545 2554 2557 2584 2586 2587 2598 2615 2616 2623 2628 2629 2641 2641 2642 2643 2643 2644 2654 2655 2657 2658 2658 2671 2685 2699 2699 2701 2729 2729 2754 2754 2754 2756 2757 2757 2757 2757 2770 2771 2771 2772 2772 2772 2772 2782 2784 2788 2814 2885".split()
spec = map(int, spec)
x = newleaderboardsequencing(spec, 26)
print x
Leader length: 1 After cut: 1 Top: Score: 1 =============== Leader length: 18 After cut: 18 Top: 115 Score: 2 ================== Leader length: 324 After cut: 58 Top: 114-114 Score: 4 ====================== Leader length: 1044 After cut: 87 Top: 113-115-128 Score: 8 ========================== Leader length: 1566 After cut: 65 Top: 156-114-113-131 Score: 14 =============================== Leader length: 1170 After cut: 32 Top: 113-71-156-131-114 Score: 20 ================================== Leader length: 576 After cut: 35 Top: 156-71-113-114-131-113 Score: 27 ====================================== Leader length: 630 After cut: 44 Top: 156-131-113-114-113-71-156 Score: 33 ========================================== Leader length: 792 After cut: 50 Top: 113-128-129-101-156-71-156-131 Score: 44 ============================================== Leader length: 900 After cut: 28 Top: 114-113-156-131-71-129-128-113-131 Score: 53 ================================================== Leader length: 504 After cut: 29 Top: 114-113-156-131-71-129-128-113-131-113 Score: 62 ====================================================== Leader length: 522 After cut: 31 Top: 156-71-113-114-131-156-113-101-129-131-113 Score: 75 ========================================================== Leader length: 558 After cut: 34 Top: 156-71-113-114-131-113-156-101-129-101-128-113 Score: 92 ============================================================== Leader length: 612 After cut: 30 Top: 156-71-113-114-131-156-113-101-129-128-128-114-128 Score: 103 =================================================================== Leader length: 540 After cut: 30 Top: 156-71-113-114-131-156-113-101-129-128-128-114-128-113 Score: 116 ======================================================================= Leader length: 540 After cut: 28 Top: 156-71-113-114-131-156-113-101-129-128-128-114-103-131-129 Score: 123 =========================================================================== Leader length: 504 After cut: 27 Top: 156-71-113-114-131-156-113-101-129-128-128-114-103-128-97-131 Score: 144 ============================================================================== Leader length: 486 After cut: 28 Top: 156-71-113-114-131-156-113-101-129-128-128-114-128-103-97-131-131 Score: 164 ================================================================================== Leader length: 504 After cut: 27 Top: 156-71-113-114-131-156-113-101-129-128-128-114-128-103-97-131-131-113 Score: 202 ====================================================================================== Leader length: 486 After cut: 30 Top: 156-71-113-114-131-156-113-101-129-128-128-114-128-103-97-131-131-128-113 Score: 212 ========================================================================================== Leader length: 540 After cut: 29 Top: 156-71-113-114-131-156-113-101-129-128-128-114-128-103-97-131-131-113-128-113 Score: 224 ============================================================================================== Leader length: 522 After cut: 27 Top: 156-71-113-114-131-156-113-101-129-128-128-114-128-103-97-131-131-113-129-128-113 Score: 253 ================================================================================================== Leader length: 486 After cut: 28 Top: 156-71-113-114-131-156-113-101-129-128-128-114-128-103-97-131-131-113-129-128-113 Score: 253 ================================================================================================== Leader length: 504 After cut: 29 Top: 156-71-113-114-131-156-113-101-129-128-128-114-128-103-97-131-131-113-131-113-128-115-113 Score: 301 ========================================================================================================== Leader length: 299 After cut: 27 Top: 156-71-113-114-131-156-113-101-129-128-128-114-128-103-97-131-131-113-131-113-128-115-128-113 Score: 435 ============================================================================================================== 156-71-113-114-131-156-113-101-129-128-128-114-128-103-97-131-131-113-131-113-128-115-128-113
Now this is much faster!
But not quite... This one still goes beyond the 5 min limit:
spec = "0 71 71 87 87 97 99 101 101 113 113 114 114 114 115 115 128 129 129 131 137 147 147 156 158 168 172 184 198 200 202 213 215 216 227 230 234 242 242 243 246 259 260 268 269 270 275 281 284 285 286 297 305 314 314 317 331 331 355 356 356 361 368 371 372 373 382 383 385 388 389 399 399 406 411 415 415 428 442 448 459 470 472 481 482 484 486 486 487 498 502 502 502 512 512 520 528 529 530 543 556 573 573 573 577 583 583 585 595 598 601 602 615 615 617 619 626 628 631 641 645 654 657 657 670 670 674 686 688 696 714 716 720 728 730 732 732 733 741 742 744 745 754 755 756 756 767 771 783 788 789 803 815 817 819 842 843 843 845 847 854 857 858 861 870 870 870 883 884 887 888 890 901 903 904 912 914 929 948 954 957 958 970 971 976 983 984 984 988 991 1001 1001 1002 1013 1014 1017 1018 1019 1025 1026 1030 1042 1072 1085 1085 1085 1085 1089 1097 1103 1104 1105 1112 1113 1117 1117 1120 1127 1129 1131 1138 1143 1148 1156 1156 1175 1182 1198 1200 1200 1203 1204 1214 1218 1218 1226 1226 1226 1230 1232 1234 1243 1260 1269 1269 1271 1276 1276 1283 1285 1289 1301 1305 1311 1315 1327 1331 1333 1340 1340 1345 1347 1347 1356 1373 1382 1384 1386 1390 1390 1390 1398 1398 1402 1412 1413 1416 1416 1418 1434 1441 1460 1460 1468 1473 1478 1485 1487 1489 1496 1499 1499 1503 1504 1511 1512 1513 1519 1527 1531 1531 1531 1531 1544 1574 1586 1590 1591 1597 1598 1599 1602 1603 1614 1615 1615 1625 1628 1632 1632 1633 1640 1645 1646 1658 1659 1662 1668 1687 1702 1704 1712 1713 1715 1726 1728 1729 1732 1733 1746 1746 1746 1755 1758 1759 1762 1769 1771 1773 1773 1774 1797 1799 1801 1813 1827 1828 1833 1845 1849 1860 1860 1861 1862 1871 1872 1874 1875 1883 1884 1884 1886 1888 1896 1900 1902 1920 1928 1930 1942 1946 1946 1959 1959 1962 1971 1975 1985 1988 1990 1997 1999 2001 2001 2014 2015 2018 2021 2031 2033 2033 2039 2043 2043 2043 2060 2073 2086 2087 2088 2096 2104 2104 2114 2114 2114 2118 2129 2130 2130 2132 2134 2135 2144 2146 2157 2168 2170 2174 2188 2201 2201 2205 2210 2217 2217 2227 2228 2231 2233 2234 2243 2244 2245 2248 2255 2260 2260 2261 2285 2285 2299 2302 2302 2311 2319 2330 2331 2332 2335 2341 2346 2347 2348 2356 2357 2370 2373 2374 2374 2382 2386 2389 2400 2401 2403 2414 2416 2418 2432 2444 2448 2458 2460 2469 2469 2479 2485 2487 2487 2488 2501 2501 2502 2502 2502 2503 2503 2515 2515 2517 2519 2529 2529 2545 2545 2616".split()
spec = map(int, spec)
N = 448
x = newleaderboardsequencing(spec, N)
print x
Leader length: 1 After cut: 1 Top: Score: 1 =============== Leader length: 18 After cut: 18 Top: 115 Score: 2 ================== Leader length: 324 After cut: 324 Top: 128-131 Score: 4 ====================== Leader length: 5832 After cut: 1254 Top: 129-156-71 Score: 8 ========================= Leader length: 22572 After cut: 1452 Top: 71-87-128-156 Score: 14 ============================= Leader length: 26136 After cut: 483 Top: 156-71-129-131-128 Score: 21 ================================== Leader length: 8694 After cut: 845 Top: 87-115-115-131-137-147 Score: 29 ====================================== Leader length: 15210 After cut: 507 Top: 101-71-87-114-113-87-97 Score: 35 ======================================= Leader length: 9126 After cut: 666 Top: 114-99-101-97-71-113-147-128 Score: 46 ============================================ Leader length: 11988 After cut: 624 Top: 113-147-128-114-113-129-114-156-71 Score: 58 ================================================== Leader length: 11232 After cut: 609 Top: 87-147-137-131-115-115-87-129-156-114 Score: 68 ===================================================== Leader length: 10962 After cut: 452 Top: 114-99-101-97-71-113-147-128-114-113-129 Score: 83 ======================================================== Leader length: 8136 After cut: 536 Top: 87-115-115-131-137-147-87-71-101-114-99-186 Score: 98 =========================================================== Leader length: 9648 After cut: 526 Top: 87-147-137-131-115-115-87-129-156-114-129-113-71 Score: 112 ================================================================= Leader length: 9468 After cut: 468 Top: 137-131-115-115-87-129-156-114-129-113-114-128-131-147 Score: 134 ======================================================================= Leader length: 8424 After cut: 479 Top: 114-113-129-114-156-129-87-115-115-131-137-147-87-71-101 Score: 153 ========================================================================= Leader length: 8622 After cut: 467 Top: 87-147-137-131-115-115-87-129-156-114-129-113-114-128-186-71 Score: 161 ============================================================================= Leader length: 8406 After cut: 469 Top: 113-129-114-156-129-87-115-115-131-137-147-87-71-101-114-99-156 Score: 191 ================================================================================ Leader length: 8442 After cut: 480 Top: 115-131-137-147-87-71-101-114-99-101-97-71-113-147-128-114-113-115 Score: 214 =================================================================================== Leader length: 8640 After cut: 518 Top: 114-99-101-97-71-113-147-128-114-113-129-114-156-129-87-115-115-131-128 Score: 229 ======================================================================================== Leader length: 9324 After cut: 509 Top: 113-129-114-156-129-87-115-115-131-137-147-87-71-101-114-99-101-97-71-87 Score: 251 ========================================================================================= Leader length: 9153 After cut: 457 Top: 114-113-129-114-156-129-87-115-115-131-137-147-128-131-114-99-101-128-186-128-114 Score: 295 ================================================================================================== Leader length: 7690 After cut: 449 Top: 156-114-129-113-114-128-147-113-71-97-101-99-114-101-71-87-147-137-131-186-131-129 Score: 429 =================================================================================================== Leader length: 3178 After cut: 452 Top: 87-115-115-131-137-147-87-71-101-114-99-101-97-71-113-147-128-114-113-129-114-156-129 Score: 507 ====================================================================================================== Leader length: 171 After cut: 171 Top: 87-115-115-131-137-147-87-71-101-114-99-101-97-71-113-147-128-114-113-129-114-156-129 Score: 507 ====================================================================================================== Leader length: 9 After cut: 9 Top: 87-115-115-131-137-147-87-71-101-114-99-101-97-71-113-147-128-114-113-129-114-156-129 Score: 507 ====================================================================================================== 87-115-115-131-137-147-87-71-101-114-99-101-97-71-113-147-128-114-113-129-114-156-129