"""Word counting two ways: an explicit loop and a ``toolz`` pipeline.

The script first counts stemmed words in a single sentence, then in a
text file, each time with a hand-written loop and with a functional
pipeline built from ``toolz``.  It ends by timing both file-based variants.
"""
import timeit

from toolz import concat, frequencies, pipe
from toolz.curried import drop
# Aliased so the builtin ``map`` is not shadowed for the rest of the module.
from toolz.curried import map as cmap

BOOK_PATH = 'data/tale-of-two-cities.txt'
HEADER_LINES = 112  # the first 112 lines include the Gutenberg header
TIMING_LOOPS = 10   # how many times each variant is run when timing

sentence = "This cat jumped over this other cat!"


def stem(word):
    """ Stem word to primitive form

    Lower-cases ``word``, then strips a fixed set of punctuation
    characters from its right end and another set from its left end.

    >>> stem("Hello!")
    'hello'
    """
    return word.lower().rstrip(",.!)-*_?:;$'-\"").lstrip("-*'\"(_$'")


def wordcount(string):
    """Count stemmed words in a single string.

    ``string`` is split on whitespace, every piece is passed through
    ``stem``, and a plain ``dict`` mapping stemmed word -> number of
    occurrences is returned.

    NOTE: this definition is replaced further down by a version that
    takes an iterable of lines; it only serves the sentence example.
    """
    counts = {}
    for word in string.split():
        stemmed = stem(word)
        counts[stemmed] = counts.get(stemmed, 0) + 1
    return counts


wordcount(sentence)

# The same computation expressed with toolz.
frequencies(map(stem, sentence.split()))


# Simple example
def double(x):
    """Return ``x`` multiplied by two."""
    return 2 * x


# pipe threads the value through each function in turn: str(double(double(3))).
pipe(3, double, double, str)

# With the curried map, ``cmap(stem)`` is a one-argument function awaiting
# its sequence, so it can sit in the middle of a pipeline.
pipe(sentence, str.split, cmap(stem), frequencies)


def wordcount(file):
    """Count stemmed words across an iterable of lines.

    ``file`` is iterated line by line (an open text file works); each line
    is split on whitespace and every piece is passed through ``stem``.
    Returns a plain ``dict`` mapping stemmed word -> number of occurrences.

    This intentionally rebinds the name ``wordcount`` defined earlier.
    """
    counts = {}
    for line in file:
        for word in line.split():
            stemmed = stem(word)
            counts[stemmed] = counts.get(stemmed, 0) + 1
    return counts


with open(BOOK_PATH) as f:
    for _ in range(HEADER_LINES):  # Burn the header lines
        next(f)
    result = wordcount(f)

result


def _count_with_pipe():
    """Count the book's words with the toolz pipeline, skipping the header."""
    # ``frequencies`` consumes the whole lazy pipeline before returning, so
    # the file can safely be closed when the ``with`` block exits.
    with open(BOOK_PATH) as f:
        return pipe(f,
                    drop(HEADER_LINES),
                    cmap(str.split),
                    concat,
                    cmap(stem),
                    frequencies)


def _count_with_loop():
    """Count the book's words with the explicit-loop ``wordcount``.

    Unlike the pipeline variant this does not skip the header lines; that
    matches the statement that was originally being timed.
    """
    with open(BOOK_PATH) as f:
        return wordcount(f)


_count_with_pipe()

# The original used IPython's ``timeit`` magic, which is not valid in a plain
# Python file; the standard-library ``timeit`` module is used instead.
print("pipe: {:.4f} s per loop".format(
    timeit.timeit(_count_with_pipe, number=TIMING_LOOPS) / TIMING_LOOPS))
print("loop: {:.4f} s per loop".format(
    timeit.timeit(_count_with_loop, number=TIMING_LOOPS) / TIMING_LOOPS))