import os
import sys
sys.path.insert(0, '/Users/t/dev/word_cloud')
from sklearn.feature_extraction.text import CountVectorizer
from wordcloud import make_wordcloud
from IPython.display import Image
# A throwaway vectorizer is built only to read scikit-learn's built-in
# English stop-word list from it.
xx = CountVectorizer(stop_words="english")
# Corpus-specific tokens to ignore, followed by the English stop words.
stop_words = (['http', 'org', '20', '80', 'www', 'titus', 'brown']
              + list(xx.get_stop_words()))
def build_wordcloud_image(lines, output_filename='foo.png'):
    """Render a word cloud of *lines* to a file and return it as an Image.

    The lines are joined with spaces into one document, tokenised with a
    ``CountVectorizer`` (module-level ``stop_words`` excluded, vocabulary
    capped with ``max_features=200``), and only words counted more than
    once are passed to ``make_wordcloud``.

    Parameters:
        lines: iterable of strings making up the text.
        output_filename: path handed to ``make_wordcloud`` and then to
            ``Image``.

    Returns:
        ``IPython.display.Image`` created from *output_filename*.
    """
    # Imported here because this file never imports numpy itself; ``np``
    # was presumably injected by the notebook's pylab mode.
    import numpy as np

    text = " ".join(lines)
    cv = CountVectorizer(charset_error="ignore",
                         stop_words=stop_words, max_features=200)
    counts = cv.fit_transform([text]).toarray().ravel()
    words = np.array(cv.get_feature_names())

    # Build the "seen more than once" mask once and apply it to both arrays
    # so words and counts stay aligned.
    frequent = counts > 1
    make_wordcloud(words[frequent], counts[frequent], output_filename)
    return Image(filename=output_filename)
def demo_wordcloud(sources=None, output_filename='out.png'):
    """Build a word cloud from the lines of one or more text files.

    Parameters:
        sources: iterable of file paths to read. Defaults to the single
            constitution.txt path used by the original demo.
        output_filename: where the rendered image is written
            (default ``'out.png'``).

    Returns:
        Whatever ``build_wordcloud_image`` returns for the collected lines.
    """
    if sources is None:
        sources = ['/Users/t/dev/word_cloud/constitution.txt']

    lines = []
    for path in sources:
        with open(path) as f:
            lines.extend(f.readlines())
    return build_wordcloud_image(lines, output_filename)
# Run the demo with its default source file and output name.
demo_wordcloud()
ls
boffin.ipynb foo.png testing-in-python.mbox foo.pdf out.png
import mailbox
# Open the mailing-list archive and load every message into memory, so
# later code can iterate over the messages repeatedly.
box = mailbox.mbox('testing-in-python.mbox')
all_messages = list(box)
# Parenthesised single-argument print behaves the same on Python 2 and 3.
print(len(all_messages))
5461
import datetime
def make_datetime(datestr):
    """Parse an e-mail ``Date`` header value into a naive ``datetime``.

    Any UTC offset in the header is ignored: the result holds the
    wall-clock time exactly as written, with no tzinfo attached.

    Parsing is delegated to ``email.utils.parsedate`` rather than a
    fixed-width ``strptime`` on the first 25 characters, so headers with
    a single-digit day ("Sun, 4 Mar 2007 ...") or without the weekday
    ("14 Mar 2007 ...") are accepted too.

    Parameters:
        datestr: the raw header text.

    Returns:
        ``datetime.datetime`` built from year, month, day, hour, minute
        and second.

    Raises:
        ValueError: if *datestr* is empty/None or is not a parseable date.
    """
    from email.utils import parsedate

    parsed = parsedate(datestr) if datestr else None
    if parsed is None:
        raise ValueError("unparseable date header: %r" % (datestr,))
    # parsedate returns a 9-item time tuple; the first six items are
    # year, month, day, hour, minute, second.
    return datetime.datetime(*parsed[:6])
def convert_all_to_dates(msgs):
    """Return the parsed ``Date`` of every message in *msgs* that has one.

    Messages whose ``Date`` header is missing or cannot be parsed by
    ``make_datetime`` are skipped, so the result may be shorter than
    *msgs*.

    Parameters:
        msgs: iterable of mapping-like messages supporting ``msg['Date']``.

    Returns:
        list of the values returned by ``make_datetime``, in input order.
    """
    dates = []
    # Iterate the argument itself; the previous version ignored ``msgs``
    # and always walked the global ``all_messages``.
    for msg in msgs:
        try:
            raw_date = msg['Date']
            parsed = make_datetime(raw_date)
        except (KeyError, TypeError, ValueError):
            # Best effort: a missing or malformed date drops only this
            # message.
            continue
        dates.append(parsed)
    return dates
# Parse the Date header of every message; unparseable ones are dropped.
x = convert_all_to_dates(all_messages)
# Time is measured relative to the first parsed message.
# NOTE(review): assumes the mbox is in chronological order — confirm.
start = x[0]
# Whole days elapsed since `start` for each message.
y = [ (d - start).days for d in x ]
# `hist` is not imported in this file; presumably provided by pylab mode.
hist(y, bins=200)
(array([ 90, 44, 6, 1, 21, 7, 9, 16, 0, 1, 5, 14, 3, 1, 4, 6, 3, 7, 5, 0, 5, 5, 4, 25, 37, 1, 15, 10, 8, 32, 32, 15, 19, 0, 12, 2, 5, 16, 10, 3, 7, 0, 0, 52, 2, 17, 99, 11, 33, 15, 2, 67, 34, 2, 1, 1, 34, 27, 2, 2, 0, 0, 10, 2, 0, 24, 1, 31, 47, 208, 187, 105, 57, 17, 3, 1, 13, 5, 39, 35, 10, 23, 40, 20, 49, 28, 14, 38, 0, 17, 36, 7, 42, 16, 2, 51, 11, 28, 11, 150, 64, 34, 23, 2, 18, 14, 5, 7, 15, 6, 2, 11, 73, 37, 57, 24, 4, 16, 78, 15, 34, 41, 31, 37, 3, 11, 13, 8, 36, 17, 12, 38, 37, 22, 103, 8, 20, 8, 1, 0, 16, 19, 25, 15, 16, 58, 19, 28, 43, 13, 18, 5, 3, 58, 12, 24, 25, 21, 36, 41, 13, 35, 40, 1, 12, 7, 13, 49, 7, 25, 20, 8, 19, 9, 1, 18, 20, 33, 15, 4, 14, 20, 1, 4, 6, 10, 17, 10, 5, 18, 8, 13, 20, 10, 12, 23, 15, 51, 10, 10]), array([ 0. , 11.04, 22.08, 33.12, 44.16, 55.2 , 66.24, 77.28, 88.32, 99.36, 110.4 , 121.44, 132.48, 143.52, 154.56, 165.6 , 176.64, 187.68, 198.72, 209.76, 220.8 , 231.84, 242.88, 253.92, 264.96, 276. , 287.04, 298.08, 309.12, 320.16, 331.2 , 342.24, 353.28, 364.32, 375.36, 386.4 , 397.44, 408.48, 419.52, 430.56, 441.6 , 452.64, 463.68, 474.72, 485.76, 496.8 , 507.84, 518.88, 529.92, 540.96, 552. , 563.04, 574.08, 585.12, 596.16, 607.2 , 618.24, 629.28, 640.32, 651.36, 662.4 , 673.44, 684.48, 695.52, 706.56, 717.6 , 728.64, 739.68, 750.72, 761.76, 772.8 , 783.84, 794.88, 805.92, 816.96, 828. , 839.04, 850.08, 861.12, 872.16, 883.2 , 894.24, 905.28, 916.32, 927.36, 938.4 , 949.44, 960.48, 971.52, 982.56, 993.6 , 1004.64, 1015.68, 1026.72, 1037.76, 1048.8 , 1059.84, 1070.88, 1081.92, 1092.96, 1104. , 1115.04, 1126.08, 1137.12, 1148.16, 1159.2 , 1170.24, 1181.28, 1192.32, 1203.36, 1214.4 , 1225.44, 1236.48, 1247.52, 1258.56, 1269.6 , 1280.64, 1291.68, 1302.72, 1313.76, 1324.8 , 1335.84, 1346.88, 1357.92, 1368.96, 1380. 
, 1391.04, 1402.08, 1413.12, 1424.16, 1435.2 , 1446.24, 1457.28, 1468.32, 1479.36, 1490.4 , 1501.44, 1512.48, 1523.52, 1534.56, 1545.6 , 1556.64, 1567.68, 1578.72, 1589.76, 1600.8 , 1611.84, 1622.88, 1633.92, 1644.96, 1656. , 1667.04, 1678.08, 1689.12, 1700.16, 1711.2 , 1722.24, 1733.28, 1744.32, 1755.36, 1766.4 , 1777.44, 1788.48, 1799.52, 1810.56, 1821.6 , 1832.64, 1843.68, 1854.72, 1865.76, 1876.8 , 1887.84, 1898.88, 1909.92, 1920.96, 1932. , 1943.04, 1954.08, 1965.12, 1976.16, 1987.2 , 1998.24, 2009.28, 2020.32, 2031.36, 2042.4 , 2053.44, 2064.48, 2075.52, 2086.56, 2097.6 , 2108.64, 2119.68, 2130.72, 2141.76, 2152.8 , 2163.84, 2174.88, 2185.92, 2196.96, 2208. ]), <a list of 200 Patch objects>)
# Distinct raw From header values, before any name normalisation.
authors = set([ k['From'] for k in all_messages ])
len(authors)
507
# Display names that need rewriting to one canonical spelling: two
# MIME-encoded names and two alternative spellings.
fix_names = {
    '=?ISO-8859-2?Q?Micha=B3_Kwiatkowski?=': 'Michal Kwiatkowski',
    'C. Titus Brown': 'Titus Brown',
    'Doug Philips': 'Douglas Philips',
    '=?ISO-8859-1?Q?Tarek_Ziad=E9?=': 'Tarek Ziade',
}


def get_name(name):
    """Return the normalised display name from a raw ``From`` header value.

    Everything from the first ``<`` onward is discarded, then surrounding
    whitespace, single quotes and double quotes are stripped (in that
    order), and the result is mapped through ``fix_names`` when listed
    there.

    Parameters:
        name: the raw header text, or None when the header is missing.

    Returns:
        The cleaned name; ``''`` for a missing or empty header.
    """
    if not name:
        # A message without a From header yields None; treat it as an
        # anonymous sender instead of raising AttributeError.
        return ''
    name = name.split('<', 1)[0].strip().strip("'").strip('"')
    return fix_names.get(name, name)
# Distinct authors once each raw From header is normalised by get_name.
names = set([get_name(name) for name in authors ])
len(names)
441
# Number of messages per normalised author name.
counts = {}
for k in all_messages:
    name = get_name(k['From'])
    if name not in counts:
        counts[name] = 0
    counts[name] += 1
# (rank, message count) rows, most prolific author first. The pairs are
# materialised with list() so numpy receives a real sequence even where
# enumerate/zip return lazy iterators.
v = numpy.array(list(enumerate(sorted(counts.values(), reverse=True))))
fig = pyplot.figure()
ax = fig.add_subplot(1, 1, 1)
# Column 0 is the rank, column 1 the message count.
ax.plot(v[:,0], v[:,1])
# Uncomment for a logarithmic y axis.
#ax.set_yscale('log')
# Show only ranks up to 50.
ax.axis(xmax=50)
(0.0, 50, 0.0, 600.0)
# (count, name) pairs sorted from most to fewest messages. Names are
# unique dict keys, so no two tuples compare equal and a descending sort
# gives the same order as reversing an ascending one.
counts_with_names = sorted(((n, who) for who, n in counts.items()),
                           reverse=True)
# Attach a 0-based rank. Kept as a real list because it is sliced below
# and again by later code.
counts_with_names = list(enumerate(counts_with_names))
counts_with_names[:50]
[(0, (572, 'Michael Foord')), (1, (329, 'holger krekel')), (2, (305, 'Olemis Lang')), (3, (299, 'Titus Brown')), (4, (177, 'Kumar McMillan')), (5, (176, 'Robert Collins')), (6, (147, 'Chris Withers')), (7, (143, 'Ned Batchelder')), (8, (130, 'Jesse Noller')), (9, (112, 'Douglas Philips')), (10, (110, 'Ben Finney')), (11, (100, 'Alfredo Deza')), (12, (97, 'jason pellerin')), (13, (75, 'Grig Gheorghiu')), (14, (65, 'Marius Gedminas')), (15, (51, 'Laura Creighton')), (16, (51, 'Gary Bernhardt')), (17, (50, 'Benji York')), (18, (49, 'Noah Gift')), (19, (49, 'Mark Sienkiewicz')), (20, (46, 'Geoff Bache')), (21, (43, 'Michal Kwiatkowski')), (22, (43, 'Jonathan Lange')), (23, (41, 'Barry Warsaw')), (24, (36, 'Jorge Vargas')), (25, (36, 'Herman Sheremetyev')), (26, (36, 'Fernando Perez')), (27, (35, 'Terry Peppers')), (28, (35, 'Andrea Crotti')), (29, (31, 'Nicolas Chauvat')), (30, (30, 'Ronny Pfannschmidt')), (31, (29, 'Pete')), (32, (28, 'Fred Drake')), (33, (27, 'Raphael Marvie')), (34, (27, 'Paul Hildebrandt')), (35, (26, 'Victoria G. Laidler')), (36, (25, 'Tarek Ziade')), (37, (25, 'Nagappan Alagappan')), (38, (25, 'Andrew Bennetts')), (39, (24, 'Nate Lowrie')), (40, (24, 'Bob Clancy')), (41, (24, 'Andrew Dalke')), (42, (23, 'Arve Knudsen')), (43, (22, 'exarkun@twistedmatrix.com')), (44, (22, 'Phlip')), (45, (21, 'andrea crotti')), (46, (21, 'John Wong')), (47, (21, 'Jim Fulton')), (48, (21, 'Chris Jerdonek')), (49, (20, 'Mark Roddy'))]
def get_messages_by_author(author):
    """Yield each message in ``all_messages`` sent by *author*.

    A message matches when ``get_name`` applied to its ``From`` header
    equals *author* exactly.
    """
    for message in all_messages:
        if get_name(message['From']) != author:
            continue
        yield message
def messages_to_wordcloud(msgs, output_filename="foo.png"):
    """Build a word cloud from the body lines of *msgs*.

    Only messages whose payload is a plain string are used. Each line is
    stripped, and lines starting with ``>`` or ``On `` are dropped
    (presumably quoted replies and "On <date>, X wrote:" attribution
    lines).

    Parameters:
        msgs: iterable of messages exposing ``get_payload()``.
        output_filename: image path passed to ``build_wordcloud_image``
            (default ``"foo.png"``).

    Returns:
        Whatever ``build_wordcloud_image`` returns for the kept lines.
    """
    kept = []
    for msg in msgs:
        payload = msg.get_payload()
        # Skip any payload that is not a single string.
        if not isinstance(payload, str):
            continue
        for line in payload.splitlines():
            line = line.strip()
            # startswith accepts a tuple of prefixes.
            if line.startswith(('>', 'On ')):
                continue
            kept.append(line)
    return build_wordcloud_image(kept, output_filename)
# How many times 'mock' occurs in each of Michael Foord's messages.
# NOTE(review): .count() is called on whatever get_payload() returns; for
# a non-string payload this is presumably a list count, not a substring
# count — confirm.
x = []
for k in get_messages_by_author('Michael Foord'):
    p = k.get_payload()
    x.append(p.count('mock'))
# Total and per-message average, printed through one formatted string so
# the statement is valid on both Python 2 and Python 3.
print("%s %s" % (sum(x), sum(x) / float(len(x))))
983 1.71853146853
# Word cloud over every message in the archive.
messages_to_wordcloud(all_messages)
# Word cloud restricted to one author's messages.
x = get_messages_by_author('Michael Foord')
messages_to_wordcloud(x)
# One word cloud for each of the first ten entries of counts_with_names,
# whose items are (rank, (message_count, author)).
for rank, (num, author) in counts_with_names[:10]:
    x = get_messages_by_author(author)
    image = messages_to_wordcloud(x)
    # Parenthesised single-argument print works on Python 2 and 3 alike.
    print(author)
    # `display` is not imported in this file; presumably provided by the
    # notebook environment.
    display(image)
Michael Foord
holger krekel
Olemis Lang
Titus Brown
Kumar McMillan
Robert Collins
Chris Withers
Ned Batchelder
Jesse Noller
Douglas Philips
# Word cloud for a single, hand-picked author.
x = get_messages_by_author("Terry Peppers")
image = messages_to_wordcloud(x)
display(image)
# Look at the Subject header of the first message in the archive.
k = all_messages[0]
k['Subject']
'[pytesting] Testing 123'
# Count messages whose subject does not start with 'Re:', in total
# (`threads`) and per normalised author name (`started`).
started = {}
threads = 0
for k in all_messages:
    if k['Subject'].startswith('Re:'):
        continue
    name = get_name(k['From'])
    if name in started:
        started[name] += 1
    else:
        started[name] = 1
    threads += 1
threads
1029
# (count, name) pairs sorted from most to fewest non-'Re:' messages.
# Names are unique dict keys, so a descending sort gives the same order
# as reversing an ascending one.
started_rank = sorted(((n, who) for who, n in started.items()),
                      reverse=True)
# Attach a 0-based rank and keep a real list, so the ranking can be sliced
# and reused instead of being a one-shot iterator.
started_rank = list(enumerate(started_rank))
started_rank[:15]
[(0, (69, 'holger krekel')), (1, (51, 'Michael Foord')), (2, (47, 'Chris Withers')), (3, (45, 'Titus Brown')), (4, (40, 'Olemis Lang')), (5, (34, 'jason pellerin')), (6, (30, 'Alfredo Deza')), (7, (20, 'Kumar McMillan')), (8, (19, 'Nagappan Alagappan')), (9, (18, 'Geoff Bache')), (10, (16, 'Ned Batchelder')), (11, (14, 'Noah Gift')), (12, (13, 'Terry Peppers')), (13, (11, 'Robert Collins')), (14, (11, 'Nate Lowrie'))]