### Approximate matching with pigeonhole principle and Boyer-Moore¶

Most of this notebook is the Boyer-Moore implementation, as we saw in a previous notebook. Scroll to the bottom for the bmApproximate function that uses it to do approximate matching with the pigeonhole principle.

In :
import (
"bytes"
"sort"
)

In :
func zArray(s string) []int {
// Use Z-algorithm to preprocess given string.  See
// Gusfield for complete description of algorithm.
Z := make([]int, len(s)+1)
Z = len(s)

// Initial comparison of s[1:] with prefix
for i := 1; i < len(s); i++ {
if s[i] == s[i-1] {
Z += 1
} else {
break
}
}

r, l := 0, 0
if Z > 0 {
r, l = Z, 1
}

for k := 2; k < len(s); k++ {
if k > r {
// Case 1
for i := k; i < len(s); i++ {
if s[i] == s[i-k] {
Z[k] += 1
} else {
break
}
}
r, l = k + Z[k] - 1, k
} else {
// Case 2
// Calculate length of beta
nbeta := r - k + 1
Zkp := Z[k - l]
if nbeta > Zkp {
// Case 2a: Zkp wins
Z[k] = Zkp
} else {
// Case 2b: Compare characters just past r
nmatch := 0
for i := r+1; i < len(s); i++ {
if s[i] == s[i - k] {
nmatch += 1
} else {
break
}
}
l, r = k, r + nmatch
Z[k] = r - k + 1
}
}
}
return Z
}

In :
// Helper function that returns a new string
// that is the reverse of the argument
func reverseString(s string) string {
var buf bytes.Buffer
for i := 0; i < len(s); i++ {
buf.WriteByte(s[len(s)-i-1])
}
return buf.String()
}

// Helper function that returns a new int slice
// that is the reverse of the argument
func reverseInts(is []int) []int {
buf := make([]int, len(is))
for i := 0; i < len(is); i++ {
buf[len(is)-i-1] = is[i]
}
return buf
}

In :
// Make N array (Gusfield theorem 2.2.2) from Z array
func nArray(s string) []int {
return reverseInts(zArray(reverseString(s)))
}

// Make L' array (Gusfield theorem 2.2.2) using p and N array.
// L'[i] = largest index j less than n such that N[j] = |P[i:]|
func bigLPrimeArray(length int, n []int) []int {
lp := make([]int, length)
for j := 0; j < length-1; j++ {
i := length - n[j]
if i < length {
lp[i] = j + 1
}
}
return lp
}

// Compile L array (Gusfield theorem 2.2.2) using p and L' array.
// L[i] = largest index j less than n such that N[j] >= |P[i:]|
func bigLArray(length int, lp []int) []int {
l := make([]int, length)
l = lp
for i := 2; i < length; i++ {
l[i] = l[i-1]
if lp[i] > l[i] {
l[i] = lp[i]
}
}
return l
}

// Compile lp' array (Gusfield theorem 2.2.4) using N array
func smallLPrimeArray(n []int) []int {
smallLps := make([]int, len(n))
for i := 0; i < len(n); i++ {
if n[i] == i+1 {  // prefix matching a suffix
smallLps[len(n)-i-1] = i+1
}
}
for i := len(n)-2; i > -1; i-- {  // "smear" them out to the left
if smallLps[i] == 0 {
smallLps[i] = smallLps[i+1]
}
}
return smallLps
}

In :
// Return tables needed to apply good suffix rule
func goodSuffixTable(p string) ([]int, []int) {
n := nArray(p)
lp := bigLPrimeArray(len(p), n)
return bigLArray(len(p), lp), smallLPrimeArray(n)
}

// Given pattern string and list with ordered alphabet characters, create
// and return a dense bad character table.  Table is indexed by offset
// then by character.
func denseBadCharTab(p string, amap map[byte]int) [][]int {
tab := make([][]int, len(p))
nxt := make([]int, len(amap))
for i, c := range []byte(p) {
tab[i] = nxt[:]
nxt[amap[c]] = i+1
}
return tab
}

In :
// Boyer-Moore preprocessing object
type BoyerMoore struct {
pattern             string
amap                map[byte]int
bigLS, smallLPrimes []int
}

// Constructor for Boyer-Moore preprocessing object
func NewBoyerMoore(p string, alphabet string) *BoyerMoore {
m := new(BoyerMoore)
m.pattern = p
m.amap = make(map[byte]int)
for i, c := range []byte(alphabet) {
m.amap[c] = i
}
m.bigLS, m.smallLPrimes = goodSuffixTable(p)
return m
}

In :
// Return # skips given by bad character rule at offset i
func (bm *BoyerMoore) badCharacterRule(i int, c byte) int {
ci := bm.amap[c]
}

// Given a mismatch at offset i, return amount to shift
// as determined by (weak) good suffix rule.
func (bm *BoyerMoore) goodSuffixRule(i int) int {
length := len(bm.bigLS)
if i == length - 1 {
return 0
}
i += 1   // i points to leftmost matching position of P
if bm.bigLS[i] > 0 {
return length - bm.bigLS[i]
}
return length - bm.smallLPrimes[i]
}

In :
// Return amount to shift in case where P matches T
func (bm *BoyerMoore) matchSkip() int {
return len(bm.smallLPrimes) - bm.smallLPrimes
}

// Utility function for taking max of integers
func Max(x, y int) int {
if x > y {
return x
}
return y
}

// Do Boyer-Moore matching
func (bm *BoyerMoore) match(t string) []int {
i := 0
occurrences := make([]int, 0)
p := bm.pattern
for i < len(t) - len(p) + 1 {
shift, skipGs := 1, 1
mismatched := false
for j := len(p) - 1; j >= 0; j-- {
if p[j] != t[i+j] {
skipGs = bm.goodSuffixRule(j)
shift = Max(shift, Max(skipBc, skipGs))
mismatched = true
break
}
}
if ! mismatched {
occurrences = append(occurrences, i)
skipGs := Max(shift, skipGs)
shift = Max(shift, skipGs)
}
i += shift
}
return occurrences
}


We start the approximate matching implementation with a partition function that splits the pattern up into equal-sized (or nearly equal-sized) pieces.

In :
// Return a string splice containing non-overlapping,
// non-empty substrings that cover p.  They should be
// as close to equal-length as possible.
func partition(p string, pieces int) []string {
base, mod := len(p) / pieces, len(p) % pieces
idx := 0
ps := make([]string, 0)
for i := 0; i < pieces; i++ {
if i >= mod {
}
newIdx := idx + base + modAdjust
ps = append(ps, p[idx:newIdx])
idx = newIdx
}
return ps
}

In :
// Use the pigeonhole principle together with Boyer-Moore to find
// approximate matches with up to a specified number of mismatches.
func bmApproximate(p string, t string, k int, alph string) []int {
ps := partition(p, k+1) // split p into k+1 non-empty, non-overlapping substrings
off := 0                // offset into p of current partition
occurrences := make(map[int]int) // we might see the same occurrence >1 time
for _, pi := range ps { // for each partition
bm := NewBoyerMoore(pi, alph)  // BM preprocess the partition
for _, hit := range bm.match(t) {
if hit - off < 0 {
continue  // pattern falls off left end of T?
}
if hit + len(p) - off > len(t) {
continue  // falls off right end?
}
// Count mismatches to left and right of the matching partition
nmm := 0
for i := 0; i < off; i++ {
if t[hit-off+i] != p[i] {
nmm++
if nmm > k {
break  // exceeded maximum # mismatches
}
}
}
if nmm <= k {
for i := off+len(pi); i < len(p); i++ {
if t[hit-off+i] != p[i] {
nmm++
if nmm > k {
break  // exceeded maximum # mismatches
}
}
}
}
if nmm <= k {
occurrences[hit-off]++  // approximate match
}
}
off += len(pi)  // Update offset of current partition
}
var occurrence_list []int
for k := range occurrences {
occurrence_list = append(occurrence_list, k)
}
sort.Ints(occurrence_list)
return occurrence_list
}

In :
bmApproximate("needle", "needle noodle nargle", 2, "abcdefghijklmnopqrstuvwxyz ")

Out:
[0 7]