# Counting DNA nucleotides
dna_string = 'AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGC'
counts = {'A': 0, 'C': 0, 'G': 0, 'T': 0}
for c in dna_string:
    counts[c] += 1
count_strings = [str(counts[base]) for base in 'ACGT']
print ' '.join(count_strings)

# Transcribing DNA into RNA: strings are immutable, so assigning to
# dna_string[i] raises a TypeError
dna_string = 'GATGGAACTTGACTACGTAAAT'
for i, c in enumerate(dna_string):
    if c == 'T':
        dna_string[i] = 'U'

# Building a new string instead
rna_string = ''
for c in dna_string:
    if c == 'T':
        rna_string += 'U'
    else:
        rna_string += c
print rna_string

# The same transcription with str.replace
rna_string_2 = dna_string.replace('T', 'U')
print rna_string == rna_string_2

# Reverse complementing a strand of DNA
dna_string = 'AAAACCCGGT'
complement = {'T': 'A', 'C': 'G', 'A': 'T', 'G': 'C'}
rc_string = ''
for c in reversed(dna_string):
    rc_string += complement[c]
print rc_string

# The same reverse complement with a translation table and slicing
import string

complement_table = string.maketrans('TCAG', 'AGTC')
c_string = dna_string.translate(complement_table)
rc_string_2 = c_string[::-1]
print rc_string == rc_string_2

# Slicing examples (each bare expression echoes its value interactively)
up_to_ten = range(10)
up_to_ten[2:4]
up_to_ten[:4]
up_to_ten[4:]
up_to_ten[2:9:3]
up_to_ten[9:2:-2]
up_to_ten[::-1]

# Pairing items from two sequences with zip
numbers = range(4)
letters = 'ABCD'
for n, l in zip(numbers, letters):
    print n, l

def hamming_distance(first, second):
    '''
    Returns the number of positions that differ between first and second.
    '''
    distance = sum(1 for f, s in zip(first, second) if f != s)
    return distance

file_name = 'data/rosalind_hamm.txt'
fh = open(file_name)
first = fh.readline().strip()
second = fh.readline().strip()
print hamming_distance(first, second)

# Enumerating permutations
import itertools

how_many = 3
perms = itertools.permutations(range(1, how_many + 1))
# permutations() returns an iterator, which has no length, so this raises a TypeError
print len(perms)

list_perms = list(perms)
print len(list_perms)
for perm in list_perms:
    strings = [str(value) for value in perm]
    print ' '.join(strings)

def find_locations(substring, string_to_search):
    '''
    Returns a list of all positions in string_to_search where substring occurs.
    '''
    locations = []
    for i in range(len(string_to_search)):
        if string_to_search[i:i + len(substring)] == substring:
            locations.append(i)
    return locations

file_name = 'data/rosalind_subs.txt'
fh = open(file_name)
string_to_search = fh.readline().strip()
substring = fh.readline().strip()
one_based = [str(l + 1) for l in find_locations(substring, string_to_search)]
print ' '.join(one_based)

# The same search with regular expressions
import re

pattern = substring
for match in re.finditer(pattern, string_to_search):
    print match.start(),

# finditer with a plain pattern skips overlapping occurrences,
# so this comparison can be False
matches = re.finditer(pattern, string_to_search)
re_one_based = [str(m.start() + 1) for m in matches]
print re_one_based == one_based

# A zero-width lookahead pattern finds overlapping occurrences too
pattern = '(?={0})'.format(substring)
matches = re.finditer(pattern, string_to_search)
re_one_based_2 = [str(m.start() + 1) for m in matches]
print re_one_based_2 == one_based

# Translating into protein with Biopython
from Bio.Seq import Seq

file_name = 'data/rosalind_prot.txt'
fh = open(file_name)
dna_string = fh.readline().strip()
dna_seq = Seq(dna_string)
aa_seq = dna_seq.translate(to_stop=True)
aa_string = str(aa_seq)
print aa_string

# Parsing fasta records
def fasta_records(file_name):
    '''
    Returns a list of (name, seq) pairs for fasta records in file_name.
    '''
    records = []
    fh = open(file_name)
    name = fh.readline().strip().lstrip('>')
    while name:
        seq = ''
        line = fh.readline().strip()
        while line and not line.startswith('>'):
            seq += line
            line = fh.readline().strip()
        records.append((name, seq))
        name = line.lstrip('>')
    return records

file_name = 'data/rosalind_gc.txt'
records = fasta_records(file_name)
for name, seq in records:
    print name
    print seq
    print

# Computing GC content
test_seq = 'GCATATATGCTAG'
gcs = [char for char in test_seq if char == 'G' or char == 'C']
print gcs

gc_count = sum(1 for char in test_seq if char == 'G' or char == 'C')
print gc_count

def gc_content(seq):
    '''
    Returns the GC content of seq as a percentage.
    '''
    gc_count = sum(1 for char in seq if char == 'G' or char == 'C')
    return 100 * float(gc_count) / len(seq)

records_gc = [(name, gc_content(seq)) for name, seq in records]
name, gc = max(records_gc, key=lambda (name, gc): gc)
print name
print gc

# Importing the gc_content module and showing its source
import gc_content
!cat gc_content.py
gc_content.fasta_records.__doc__

# A simple generator
def simple_generator(n):
    for i in range(n):
        yield i

my_gen = simple_generator(4)
print [number**2 for number in my_gen]

# Reading a fasta record straight from UniProt
import urllib

opened_url = urllib.urlopen('http://www.uniprot.org/uniprot/A2Z669.fasta')
name, seq = gc_content.fasta_records(opened_url).next()
print name
print seq

# Finding the N-glycosylation motif in each listed UniProt record
import urllib

motif = re.compile(r'(?=(N[^P][ST][^P]))')
file_name = 'data/rosalind_mprt.txt'
fh = open(file_name)
for line in fh:
    uniprot_id = line.strip()
    url = 'http://www.uniprot.org/uniprot/{0}.fasta'.format(uniprot_id)
    opened_url = urllib.urlopen(url)
    for name, seq in gc_content.fasta_records(opened_url):
        ps = [str(match.start() + 1) for match in motif.finditer(seq)]
        if ps:
            print uniprot_id
            print ' '.join(ps)
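The source of gc_content.py is not reproduced here (the output of the !cat cell is missing). Judging from how the module is used, gc_content.fasta_records(opened_url).next() implies a generator that accepts an already opened file-like object, so a minimal sketch of what it presumably contains is given below: a generator version of fasta_records plus the gc_content helper defined earlier. The file-name-or-handle handling is an assumption, not the module's confirmed interface.

# gc_content.py -- assumed contents, reconstructed from how the module is
# used above; the real file may differ.

def fasta_records(fasta_file):
    '''
    Yields (name, seq) pairs for the fasta records in fasta_file, which may
    be a file name or an already opened file-like object (assumption: this
    is why urllib.urlopen results can be passed in directly).
    '''
    fh = fasta_file if hasattr(fasta_file, 'readline') else open(fasta_file)
    name = fh.readline().strip().lstrip('>')
    while name:
        seq = ''
        line = fh.readline().strip()
        while line and not line.startswith('>'):
            seq += line
            line = fh.readline().strip()
        yield (name, seq)
        name = line.lstrip('>')


def gc_content(seq):
    '''
    Returns the GC content of seq as a percentage.
    '''
    gc_count = sum(1 for char in seq if char == 'G' or char == 'C')
    return 100 * float(gc_count) / len(seq)

Because this fasta_records is a generator, records are produced one at a time, which is why .next() can pull just the first record from the UniProt download without building a list of all records first.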