In [1]:
def neighbors1mm(kmer, alpha):
''' Generate all neighbors at Hamming distance 1 from kmer '''
neighbors = []
for j in range(len(kmer)-1, -1, -1):
oldc = kmer[j]
for c in alpha:
if c == oldc: continue
neighbors.append(kmer[:j] + c + kmer[j+1:])
return neighbors

In [2]:
neighbors1mm('CAT', 'ACGT')

Out[2]:
['CAA', 'CAC', 'CAG', 'CCT', 'CGT', 'CTT', 'AAT', 'GAT', 'TAT']
In [3]:
def kmerHist(reads, k):
''' Return k-mer histogram and average # k-mer occurrences '''
kmerhist = {}
kmerhist[kmer] = kmerhist.get(kmer, 0) + 1
return kmerhist

In [4]:
khist = kmerHist(['CAT' * 10], 3)
khist

Out[4]:
{'ATC': 9, 'CAT': 10, 'TCA': 9}
In [5]:
def correct1mm(read, k, kmerhist, alpha, thresh):
''' Return an error-corrected version of read.  k = k-mer length.
kmerhist is kmer count map.  alpha is alphabet.  thresh is
count threshold above which k-mer is considered correct. '''
# Iterate over k-mers in read
# If k-mer is infrequent...
if kmerhist.get(kmer, 0) <= thresh:
# Look for a frequent neighbor
for newkmer in neighbors1mm(kmer, alpha):
if kmerhist.get(newkmer, 0) > thresh:
# Found a frequent neighbor; replace old kmer
# with neighbor
break

In [6]:
correct1mm('CAT', 3, khist, 'ACGT', 2)

Out[6]:
'CAT'
In [7]:
correct1mm('CTT', 3, khist, 'ACGT', 2)

Out[7]:
'CAT'