import timeit
import re
import random
# Build the test data set: 3000 distinct needles of the form
# '0000magicword' .. '2999magicword' (zero-padded 4-digit counter + suffix).
# `range` and the call form of `print` are used so the snippet runs
# unchanged on both Python 2 and Python 3.
strings_to_match = ['%04dmagicword' % i for i in range(3000)]
# Show a small sample and the total count as a sanity check.
print(strings_to_match[:10])
print(len(strings_to_match))
# Output:
# ['0000magicword', '0001magicword', '0002magicword', '0003magicword', '0004magicword', '0005magicword', '0006magicword', '0007magicword', '0008magicword', '0009magicword']
# 3000
# Build a test "file": an in-memory list of lines, each being one line of
# the system word list with a randomly chosen needle spliced into it, so
# every line is guaranteed to contain at least one entry of
# strings_to_match.
my_file = []
with open('/usr/share/dict/words') as fh:
    # Fixed seed so the generated data is the same on every run.
    random.seed(42)
    for line in fh:
        # randint includes both endpoints, so split_point may equal
        # len(line); in that case the needle lands after the trailing
        # newline of the dictionary line.
        split_point = random.randint(0, len(line))
        my_file.append(line[:split_point] + random.choice(strings_to_match) + line[split_point:])
# Cap the data set at one million lines.
my_file = my_file[:1000000]
# Show a small sample and the total count as a sanity check
# (call form of `print` works on both Python 2 and Python 3).
print(my_file[:10])
print(len(my_file))
# Output:
# ['a0075magicword\n', '0669magicwordA\n', 'aa2030magicword\n', 'AA\n0260magicword', 'aa0089magicworda\n', 'A1516magicwordachen\n', '0596magicwordAachenem\n', 'Aachen1634magicwordie\n', 'Aa1767magicwordchenowi\n', 'Aachenu0019magicword\n']
# 1000000
def naive_match():
    """Check that every line of ``my_file`` contains a needle.

    Brute-force baseline: each line is tested against the entries of
    ``strings_to_match`` one at a time using the ``in`` substring operator.

    Returns:
        True when every line contains at least one entry of
        ``strings_to_match`` (also when ``my_file`` is empty); False as
        soon as a line containing none of them is encountered.
    """
    # `all` stops at the first line without a hit; `any` stops at the
    # first needle found in a line.
    return all(
        any(needle in text for needle in strings_to_match)
        for text in my_file
    )
%%timeit
naive_match()
# Output: 1 loops, best of 3: 1min 46s per loop
def re_match():
    """Check that every line of ``my_file`` contains a needle, via one regex.

    All entries of ``strings_to_match`` are escaped and combined into a
    single alternation, which is compiled once per call and then searched
    for in each line.

    Returns:
        True when the pattern is found in every line of ``my_file`` (also
        when ``my_file`` is empty); False as soon as a line without a
        match is encountered.
    """
    # Escape each needle so it is matched literally, then OR them together
    # inside a single group.
    alternatives = [re.escape(needle) for needle in strings_to_match]
    pattern = re.compile('(' + '|'.join(alternatives) + ')')

    # `search` yields a match object (truthy) or None; `all` stops at the
    # first line that produces None.
    return all(pattern.search(text) for text in my_file)
%%timeit
re_match()
# Output: 1 loops, best of 3: 9.97 s per loop