# Render the MapReduce illustration inline in the IPython notebook
# (display.Image fetches and embeds the remote PNG).
from IPython import display
display.Image('https://mikecvet.files.wordpress.com/2010/07/mrfigure.png')
# Demo: MapReduce in miniature — map each list to its length,
# then reduce the lengths by summing them.
a = [1, 2, 3]
b = [4, 5, 6, 7]
c = [8, 9, 1, 2, 3]
L = map(lambda lst: len(lst), [a, b, c])
# L == [3, 4, 5]
N = reduce(lambda acc, length: acc + length, L)
# N == 12
# The same computation written as a single expression
N = reduce(lambda acc, length: acc + length, map(lambda lst: len(lst), [a, b, c]))
N
# Output: 12
import sys
from multiprocessing import Pool
def sanitize(w):
    """
    Trim leading and trailing non-alphanumeric characters
    (typically punctuation) from a token. Punctuation inside
    the token is left intact.
    """
    start = 0
    end = len(w)
    # Advance past punctuation at the front
    while start < end and not w[start].isalnum():
        start += 1
    # Back off punctuation at the rear
    while end > start and not w[end - 1].isalnum():
        end -= 1
    return w[start:end]
def load(path):
    """
    Read the file at the given path and return its contents as a
    list of whitespace-separated tokens.

    Fixes over the original: the file handle was never closed
    (leaked on every call) — a `with` block now guarantees cleanup —
    and the docstring wrongly claimed a string was returned.
    """
    with open(path, "r") as f:
        # Efficiently concatenate the lines, then tokenize on whitespace
        return (''.join(f)).split()
def chunks(l, n):
    """
    Generator yielding successive slices of list l, each of length n
    (the final chunk may be shorter).

    Fix: `xrange` is Python-2-only; `range` is behaviorally
    equivalent here and works on both Python 2 and 3.
    """
    for i in range(0, len(l), n):
        yield l[i:i+n]
def tuple_sort (a, b):
    """
    Comparator for (token, count) tuples: order by descending count
    (index 1), breaking ties by ascending token (index 0).

    Fix: `cmp()` was removed in Python 3; `(x > y) - (x < y)` is the
    documented portable equivalent and returns the same -1/0/1 on
    Python 2 as well.
    """
    if a[1] < b[1]:
        return 1
    elif a[1] > b[1]:
        return -1
    else:
        # Portable replacement for cmp(a[0], b[0])
        return (a[0] > b[0]) - (a[0] < b[0])
def Map(L):
    """
    Emit a (token, 1) count pair for every title-cased token in L.
    Tokens carrying non-alphanumeric characters are first cleaned of
    leading/trailing punctuation via sanitize().
    """
    pairs = []
    for token in L:
        # Only non-alphanumeric tokens need punctuation trimmed
        cleaned = token if token.isalnum() else sanitize(token)
        # Keep only title-cased (proper-noun-like) tokens
        if cleaned.istitle():
            pairs.append((cleaned, 1))
    return pairs
def Partition(L):
    """
    Merge the per-chunk lists of (token, 1) pairs into a single
    dictionary keyed by token, with the structure
    {token : [(token, 1), ...], ...}, so that the Reduce step can
    total each token's counts.
    """
    tf = {}
    for sublist in L:
        for pair in sublist:
            # Create the bucket on first sight of a token, then append
            tf.setdefault(pair[0], []).append(pair)
    return tf
def Reduce(Mapping):
    """
    Collapse a (token, [(token, 1), ...]) pair from the Map phase
    into a single (token, total_frequency) tuple.
    """
    token, count_pairs = Mapping
    total = 0
    for _, count in count_pairs:
        total += count
    return (token, total)
# Load file, stuff it into a string
# Reads 'big.txt' from the working directory at import time and
# tokenizes it into a module-level list of whitespace-split tokens,
# shared by compute_with_cores() below.
text = load('big.txt')
def compute_with_cores(cores=1):
# Build a pool of 8 processes
pool = Pool(processes=cores)
# Fragment the string data into 8 chunks
partitioned_text = list(chunks(text, len(text) /cores))
# Generate count tuples for title-cased tokens
single_count_tuples = pool.map(Map, partitioned_text)
# Organize the count tuples; lists of tuples by token key
token_to_tuples = Partition(single_count_tuples)
# Collapse the lists of tuples into total term frequencies
term_frequencies = pool.map(Reduce, token_to_tuples.items())
# Sort the term frequencies in nonincreasing order
term_frequencies.sort (tuple_sort)
for pair in term_frequencies[:3]:
print pair[0], ":", pair[1]
%%timeit
compute_with_cores(1)
# Output: I : 7182   The : 7116   He : 2309   — 1 loops, best of 3: 1.81 s per loop
%%timeit
compute_with_cores(4)
# Output: I : 7182   The : 7116   He : 2309   — 1 loops, best of 3: 1.6 s per loop
%%timeit
compute_with_cores(8)
# Output: I : 7182   The : 7116   He : 2309   — 1 loops, best of 3: 1.85 s per loop