#!/usr/bin/env python # coding: utf-8 # ## Suffix tree # Note that this suffix-tree implementation stores string labels for the edges. In reality, to achieve the O(m) space bound, we store (offset, length) pairs instead, where a pair refers to the substring of T labeling the edge. We use substrings here to keep the code simpler. # In[1]: class SuffixTree(object): class Node(object): def __init__(self, lab): self.lab = lab # label on path leading to this node self.out = {} # outgoing edges; maps characters to nodes def __init__(self, s): """ Make suffix tree, without suffix links, from s in quadratic time and linear space """ s += '$' self.root = self.Node(None) self.root.out[s[0]] = self.Node(s) # trie for just longest suf # add the rest of the suffixes, from longest to shortest for i in range(1, len(s)): # start at root; we’ll walk down as far as we can go cur = self.root j = i while j < len(s): if s[j] in cur.out: child = cur.out[s[j]] lab = child.lab # Walk along edge until we exhaust edge label or # until we mismatch k = j+1 while k-j < len(lab) and s[k] == lab[k-j]: k += 1 if k-j == len(lab): cur = child # we exhausted the edge j = k else: # we fell off in middle of edge cExist, cNew = lab[k-j], s[k] # create “mid”: new node bisecting edge mid = self.Node(lab[:k-j]) mid.out[cNew] = self.Node(s[k:]) # original child becomes mid’s child mid.out[cExist] = child # original child’s label is curtailed child.lab = lab[k-j:] # mid becomes new child of original parent cur.out[s[j]] = mid else: # Fell off tree at a node: make new edge hanging off it cur.out[s[j]] = self.Node(s[j:]) def followPath(self, s): """ Follow path given by s. If we fall off tree, return None. If we finish mid-edge, return (node, offset) where 'node' is child and 'offset' is label offset. If we finish on a node, return (node, None). """ cur = self.root i = 0 while i < len(s): c = s[i] if c not in cur.out: return (None, None) # fell off at a node child = cur.out[s[i]] lab = child.lab j = i+1 while j-i < len(lab) and j < len(s) and s[j] == lab[j-i]: j += 1 if j-i == len(lab): cur = child # exhausted edge i = j elif j == len(s): return (child, j-i) # exhausted query string in middle of edge else: return (None, None) # fell off in the middle of the edge return (cur, None) # exhausted query string at internal node def hasSubstring(self, s): """ Return true iff s appears as a substring """ node, off = self.followPath(s) return node is not None def hasSuffix(self, s): """ Return true iff s is a suffix """ node, off = self.followPath(s) if node is None: return False # fell off the tree if off is None: # finished on top of a node return '$' in node.out else: # finished at offset 'off' within an edge leading to 'node' return node.lab[off] == '$' # In[2]: stree = SuffixTree('there would have been a time for such a word') # In[3]: stree.hasSubstring('nope') # In[4]: stree.hasSubstring('would have been') # In[5]: stree.hasSubstring('such a word') # In[6]: stree.hasSuffix('would have been') # In[7]: stree.hasSuffix('such a word')