#!/usr/bin/env python # coding: utf-8 # In[1]: # Let's make a class that implements an inverted (substring) index where the map data # structure is a sorted list of (substring, offset) pairs. Query w/ binary search. import sys import bisect # for binary search class IndexSorted(object): def __init__(self, t, ln, ival): """ Create index, extracting substrings of length 'ln' """ self.t = t self.ln = ln self.ival = ival self.index = [] for i in range(len(t)-ln+1): self.index.append((t[i:i+ln], i)) # add pair self.index.sort() # sort pairs def query(self, p): """ Return candidate alignments for p """ st = bisect.bisect_left(self.index, (p[:self.ln], -1)) # binary search en = bisect.bisect_right(self.index, (p[:self.ln], sys.maxsize)) # binary search hits = self.index[st:en] # this range of elements corresponds to the hits return [ h[1] for h in hits ] # return just the offsets # In[2]: index = IndexSorted('CGTGCCTACTTACTTACAT', 5, 4) # In[3]: index.query('CTTACTTA') # index: give us hints on where to look for this pattern # In[4]: index.query('TTTTTTTT') # index: give us hints on where to look for this pattern # In[5]: # Now let's make a similar class but using a Python dictionary instead of a sorted # list. A Python dictionary is basically a hash table. class IndexHash(object): def __init__(self, t, ln, ival): """ Create index, extracting substrings of length 'ln' """ self.t = t self.ln = ln self.ival = ival self.index = {} for i in range(len(t)-ln+1): substr = t[i:i+ln] if substr in self.index: self.index[substr].append(i) # substring already in dictionary else: self.index[substr] = [i] # add to dictionary def query(self, p): """ Return candidate alignments for p """ return self.index.get(p[:self.ln], [])[:] # In[6]: index2 = IndexHash('CGTGCCTACTTACTTACAT', 5, 4) # In[7]: index2.query('CTTACTTA') # index: give us hints on where to look for this pattern # In[8]: index2.query('TTTTTTTT') # index: give us hints on where to look for this pattern