"""Word clouds and posting statistics for a mailing-list mbox archive.

Exported from an interactive IPython session: bare expressions such as
``len(authors)`` are kept because the notebook displays their value.
"""
import datetime
import mailbox
import os
import sys
from collections import Counter

# The local word_cloud checkout has to be importable before `wordcloud` is.
sys.path.insert(0, '/Users/t/dev/word_cloud')

import numpy as np
from IPython.display import Image, display
from sklearn.feature_extraction.text import CountVectorizer
from wordcloud import make_wordcloud

# NOTE(review): `hist` and `pyplot` are used below but never imported in this
# file; presumably the session runs with pylab enabled -- confirm.

# Corpus-specific noise tokens plus scikit-learn's English stop-word list.
stop_words = ['http', 'org', '20', '80', 'www', 'titus', 'brown']
xx = CountVectorizer(stop_words="english")
stop_words.extend(xx.get_stop_words())


def build_wordcloud_image(lines, output_filename='foo.png'):
    """Build a word cloud from *lines* and return it as a displayable image.

    The lines are joined with single spaces into one document and counted
    with a ``CountVectorizer`` limited to 200 features and filtered by the
    module-level ``stop_words``.  Terms counted only once are dropped before
    the remaining words and counts are passed to ``make_wordcloud``.

    Args:
        lines: iterable of strings to analyse.
        output_filename: path handed to ``make_wordcloud`` and then used as
            the source of the returned image.

    Returns:
        ``IPython.display.Image`` created from ``output_filename``.
    """
    text = " ".join(lines)
    # NOTE(review): newer scikit-learn releases spell `charset_error` as
    # `decode_error`; confirm the installed version before renaming it.
    cv = CountVectorizer(charset_error="ignore", stop_words=stop_words,
                         max_features=200)
    counts = cv.fit_transform([text]).toarray().ravel()
    words = np.array(cv.get_feature_names())
    # Build the mask once so words and counts are filtered identically.
    keep = counts > 1
    make_wordcloud(words[keep], counts[keep], output_filename)
    return Image(filename=output_filename)


def demo_wordcloud():
    """Return the word cloud of ``constitution.txt``, rendered to ``out.png``."""
    sources = ['/Users/t/dev/word_cloud/constitution.txt']
    lines = []
    for path in sources:
        with open(path) as f:
            lines.extend(f.readlines())
    return build_wordcloud_image(lines, 'out.png')


demo_wordcloud()

# The original cell was the IPython automagic `ls`, which is not Python;
# list the working directory with the standard library instead.
os.listdir('.')

box = mailbox.mbox('testing-in-python.mbox')
all_messages = list(box)
print(len(all_messages))


def make_datetime(datestr):
    """Parse the leading ``"%a, %d %b %Y %H:%M:%S"`` part of *datestr*.

    Only the first 25 characters are considered, so whatever follows them is
    ignored and the returned ``datetime`` is naive.

    Raises:
        TypeError: if *datestr* cannot be sliced (for example ``None``).
        ValueError: if the text does not match the expected layout.
    """
    # With a one-digit day the 25-character slice ends in a space, which
    # strptime rejects as unconverted data; strip trailing whitespace first.
    day = datestr[:25].rstrip()
    return datetime.datetime.strptime(day, "%a, %d %b %Y %H:%M:%S")


def convert_all_to_dates(msgs):
    """Return the parsed ``Date`` header of each message in *msgs*.

    Messages whose header is missing or does not parse are skipped, so the
    result may be shorter than *msgs*.
    """
    dates = []
    for msg in msgs:
        try:
            dates.append(make_datetime(msg['Date']))
        except (TypeError, ValueError):
            # Best effort: an unparseable date only drops that one message.
            continue
    return dates


x = convert_all_to_dates(all_messages)
start = x[0]
# Age of each message, in whole days, relative to the first parsed date.
y = [(d - start).days for d in x]
hist(y, bins=200)

authors = {k['From'] for k in all_messages}
len(authors)

# Raw display names mapped to the single spelling used for counting.
fix_names = {
    '=?ISO-8859-2?Q?Micha=B3_Kwiatkowski?=': 'Michal Kwiatkowski',
    'C. Titus Brown': 'Titus Brown',
    'Doug Philips': 'Douglas Philips',
    '=?ISO-8859-1?Q?Tarek_Ziad=E9?=': 'Tarek Ziade',
}


def get_name(name):
    """Return the display name from a ``From`` header value.

    Everything from the first ``<`` onwards is discarded, surrounding
    whitespace and quote characters are stripped, and the result is mapped
    through ``fix_names`` when an entry exists.
    """
    name = name.split('<', 1)[0].strip().strip("'").strip('"')
    return fix_names.get(name, name)


names = {get_name(name) for name in authors}
len(names)

# Number of messages per normalised author name.
counts = Counter(get_name(k['From']) for k in all_messages)

# (rank, message count) pairs, most prolific author first.
v = np.array(list(enumerate(sorted(counts.values(), reverse=True))))

fig = pyplot.figure()
ax = fig.add_subplot(1, 1, 1)
ax.plot(v[:, 0], v[:, 1])
#ax.set_yscale('log')
ax.axis(xmax=50)

# (rank, (message count, author)) entries, most prolific author first.
counts_with_names = list(enumerate(
    sorted(((counts[name], name) for name in counts), reverse=True)))
counts_with_names[:50]


def get_messages_by_author(author):
    """Yield each message in ``all_messages`` whose sender is *author*.

    The sender is the ``From`` header passed through ``get_name``.
    """
    for msg in all_messages:
        if get_name(msg['From']) == author:
            yield msg


def messages_to_wordcloud(msgs):
    """Build the ``foo.png`` word cloud from the payload text of *msgs*.

    Messages whose payload is not a ``str`` are skipped.  Every remaining
    payload line is stripped, and lines starting with ``>`` or ``On `` are
    left out.

    Returns:
        The image returned by ``build_wordcloud_image``.
    """
    kept = []
    for msg in msgs:
        payload = msg.get_payload()
        if not isinstance(payload, str):
            continue
        for line in payload.splitlines():
            line = line.strip()
            if line.startswith(('>', 'On ')):
                continue
            kept.append(line)
    return build_wordcloud_image(kept, "foo.png")


# Occurrences of 'mock' in each of Michael Foord's messages: total and mean.
x = [k.get_payload().count('mock')
     for k in get_messages_by_author('Michael Foord')]
print("%s %s" % (sum(x), sum(x) / float(len(x))))

messages_to_wordcloud(all_messages)

x = get_messages_by_author('Michael Foord')
messages_to_wordcloud(x)

# One word cloud for each of the ten most prolific authors.
for rank, (num, author) in counts_with_names[:10]:
    x = get_messages_by_author(author)
    image = messages_to_wordcloud(x)
    print(author)
    display(image)

x = get_messages_by_author("Terry Peppers")
image = messages_to_wordcloud(x)
display(image)

k = all_messages[0]
k['Subject']

# A message whose subject does not start with 'Re:' counts as a new thread,
# credited to its sender.
started = Counter(
    get_name(k['From'])
    for k in all_messages
    if not k['Subject'].startswith('Re:')
)
threads = sum(started.values())
threads

# (rank, (threads started, author)) entries, most threads first.
started_rank = list(enumerate(
    sorted(((n, who) for who, n in started.items()), reverse=True)))
started_rank[:15]