In [1]:
# A simple example of a substring index; mirrors example from lecture notes

# we're going to extract 4 substrings like this:
# t:           CGTGCCTACTTACTTACAT
# substring 1: CGTGC
# substring 2:     CCTAC
# substring 3:         CTTAC
# substring 4:             CTTAC
t = 'CGTGCCTACTTACTTACAT'

In [2]:
# From t, make list of pairs, where first pair item is substring, second is its offset
def substringize(t, ln, iv):
# ln = length of substrings to extract
# iv = distance between substings to extract; e.g. 1 means take *every* substring
pairs = []
# Loop below is like a Java/C loop saying: for(i = 0; i < len(t) - ln + 1; i += iv)
for i in range(0, len(t) - ln + 1, iv):
pairs.append((t[i:i+ln], i))
return pairs
# Could also have used list comprehension:
# return [ (t[i:i+ln], i) for i in range(0, len(t) - ln + 1, iv) ]

In [3]:
substringize('CGTGCCTACTTACTTACAT', 5, 4)

Out[3]:
[('CGTGC', 0), ('CCTAC', 4), ('CTTAC', 8), ('CTTAC', 12)]
In [4]:
# Like substringize, but uses a map data structure
def mapize(t, ln, iv):
index = {}
for i in range(0, len(t) - ln + 1, iv):
sub = t[i:i+ln]
if sub in index:
index[sub].append(i) # already have an entry; append to it
else:
index[sub] = [i] # don't yet have entry, make new one
return index

In [5]:
index = mapize('CGTGCCTACTTACTTACAT', 5, 4)
index

Out[5]:
{'CCTAC': [4], 'CGTGC': [0], 'CTTAC': [8, 12]}
In [6]:
p = 'CTTACTTA'

In [7]:
# index: give me a hint where I should look for occurrences of p in t
index[p[:5]]

Out[7]:
[8, 12]