# Smoke test of the session: print a greeting and draw a random walk.
# The function-call form of print works on both Python 2 and Python 3.
print('hello world')
# NOTE(review): `plot` and `np` are not imported anywhere in this file;
# presumably they come from an IPython pylab session -- confirm.
plot(np.random.randn(1000).cumsum())
hello world
[<matplotlib.lines.Line2D at 0x115ba7050>]
# IPython `cd` magic (not plain Python): switch the working directory.
# Presumably this is where the CSV files read below live -- confirm.
cd stackexchange/
/Users/wesm/Dropbox/talks/20130614pyconsg/stackexchange
import pandas as pd
# CSV files to load, one per date range (names encode the covered period).
files = [
    'Python200901-07.csv',
    'Python200907-201007.csv',
    'Python201007-201107.csv',
    'Python201107-12.csv',
    'Python201201-07.csv',
    'Python201207-12.csv',
    'Python201301.csv',
]

# Read every file into its own DataFrame, then stack them into one table
# with a fresh 0..n-1 index.
tables = [pd.read_csv(path) for path in files]
posts = pd.concat(tables, ignore_index=True)

# Show the first row. `.ix` no longer exists in current pandas; because the
# index was just rebuilt as 0..n-1, label 0 is the first row, so `.loc[0]`
# selects the same record.
posts.loc[0]
AnswerCount 7 CommentCount 1 CreationDate 2009-01-01 00:55:16 Id 404346 OwnerDisplayName marc lincoln OwnerUserId 47204 ParentId NaN PostTypeId 1 Score 2 Tags <python><math> Title Python program to calculate harmonic series Name: 0, dtype: object
# Display the raw Tags column; each value packs all of a post's tags into
# one string such as '<python><math>'.
posts.Tags
0 <python><math> 1 <python><binding><scope><identifier> 2 <python><windows><subprocess><popen> 3 <python><com><outlook><outlook-2007> 4 <python><windows><winapi><controls> 5 <python><opengl><3d><wxpython> 6 <python><xml><google-app-engine><parsing> 7 <python><networking> 8 <python><urlencode> 9 <python><django><django-templates> 10 <python><syntax-rules> 11 <python><google-app-engine> 12 <python><open-source><projects> 13 <python><global-variables> 14 <python><ctypes> ... 191976 <python><delete><pyside><qtreewidget><top-level> 191977 <python><response><serial><ussd><non-english> 191978 <python><bugs><args> 191979 <python><user-interface><coding-style><wxpython> 191980 <python><sockets> 191981 <python><django><setuptools><django-testing> 191982 <python><mysql><application> 191983 <c++><python><c><shared-libraries><python-c-ex... 191984 <python><soap><zsi> 191985 <python><r><rpy2><bioconductor> 191986 <python><greenlets> 191987 <python><pygame><python-3.3> 191988 <python><twisted> 191989 <python><multidimensional-array><numpy><indexing> 191990 <python><function><primes> Name: Tags, Length: 191991, dtype: object
import re
# Each Tags value looks like '<python><math>'; capture the text inside every
# <...> pair.
regex = re.compile(r'<([^>]*)>')

# Build one (post Id, tag) row per tag so tags can be grouped individually.
ids = []
tags = []
# `post_id` instead of `id`, to avoid shadowing the builtin.
for post_id, tag_string in zip(posts.Id, posts.Tags):
    found = regex.findall(tag_string)
    ids.extend([post_id] * len(found))
    tags.extend(found)

# Pin the column order explicitly: older pandas sorted dict keys (Id first),
# newer pandas keeps insertion order, so relying on the dict is fragile.
tag_table = pd.DataFrame({'Id': ids, 'subtag': tags},
                         columns=['Id', 'subtag'])
tag_table.head()
Id | subtag | |
---|---|---|
0 | 404346 | python |
1 | 404346 | math |
2 | 404534 | python |
3 | 404534 | binding |
4 | 404534 | scope |
# Attach the full post record to every (Id, subtag) row. 'Id' is the only
# column the two tables share, so naming it explicitly keeps the join stable
# if either table later gains a same-named column.
merged = pd.merge(tag_table, posts, on='Id')
# Show the first merged row. `.ix` no longer exists in current pandas; the
# merge result has a default 0..n-1 index, so `.loc[0]` is the first row.
merged.loc[0]
Id 404346 subtag python AnswerCount 7 CommentCount 1 CreationDate 2009-01-01 00:55:16 OwnerDisplayName marc lincoln OwnerUserId 47204 ParentId NaN PostTypeId 1 Score 2 Tags <python><math> Title Python program to calculate harmonic series Name: 0, dtype: object
# The 500 most frequent tags, most common first. `Series.order()` no longer
# exists in current pandas; `sort_values()` is its replacement.
top = merged.groupby('subtag').size().sort_values(ascending=False).iloc[:500]

# Parse the timestamps so the rows can be resampled by date. Item assignment
# is used because attribute assignment only works for existing columns.
merged['CreationDate'] = pd.to_datetime(merged['CreationDate'])

# Keep only rows whose tag is in the top 500, then group them per tag.
filtered = merged[merged.subtag.isin(top.index)]
grouped = filtered.groupby('subtag')
def agg_monthly(group):
    """Count the rows of ``group`` per calendar month.

    Parameters
    ----------
    group : DataFrame
        Must have a datetime-like 'CreationDate' column and a 'Score' column.

    Returns
    -------
    Series
        Indexed by month-end timestamp; each value is the number of non-null
        'Score' entries in that month (0 for months with no rows).
    """
    scores = group.set_index('CreationDate').Score
    # `resample(..., how='count')` no longer exists in current pandas; call
    # the aggregation on the resampler instead. An offset object is used
    # rather than the 'M' string alias, which newer pandas deprecates.
    return scores.resample(pd.offsets.MonthEnd()).count()
# Apply the per-tag aggregation, pivot so each tag becomes a column, and
# drop every row after 2013-05-31.
results = (
    grouped.apply(agg_monthly)
    .unstack('subtag')
    .loc[:'2013-05-31']
)
# Which of the top tags mention 'meta'?
[tag for tag in top.index if 'meta' in tag]
['metaclass', 'metaprogramming']
# Express every tag's count as a fraction of the 'python' count for the same
# row, then plot the 'metaprogramming' column.
normed = results.divide(results['python'], axis='index')
normed['metaprogramming'].plot()
<matplotlib.axes.AxesSubplot at 0x1111d1050>
# Plot the raw (un-normalised) per-period count for the 'python' tag.
results['python'].plot()
<matplotlib.axes.AxesSubplot at 0x128a40350>
# Recompute each tag's count relative to the 'python' count, keep rows up to
# 2013-05-31 for analysis, and plot the 'pandas' column.
normed = results.divide(results['python'], axis='index')
to_analyze = normed.loc[:'2013-05-31']
to_analyze['pandas'].plot()
<matplotlib.axes.AxesSubplot at 0x127ec7e90>
# Plot the 'django' count as a fraction of the 'python' count over time.
to_analyze['django'].plot()
<matplotlib.axes.AxesSubplot at 0x10965f7d0>
# Plot the 'flask' count as a fraction of the 'python' count over time.
to_analyze['flask'].plot()
<matplotlib.axes.AxesSubplot at 0x109a9b890>
# Plot the 'google-app-engine' count as a fraction of the 'python' count.
to_analyze['google-app-engine'].plot()
<matplotlib.axes.AxesSubplot at 0x111811950>
# Plot the 'python-3.x' count as a fraction of the 'python' count.
to_analyze['python-3.x'].plot()
<matplotlib.axes.AxesSubplot at 0x111680f10>
# Plot the 'matplotlib' count as a fraction of the 'python' count.
to_analyze['matplotlib'].plot()
<matplotlib.axes.AxesSubplot at 0x1118496d0>
# Plot the 'regex' count as a fraction of the 'python' count.
to_analyze['regex'].plot()
<matplotlib.axes.AxesSubplot at 0x1132bba50>
# Display the 500 most frequent tags with their row counts, largest first.
top
subtag python 191991 django 20986 google-app-engine 6147 list 5577 numpy 5397 python-2.7 4953 regex 4598 python-3.x 4303 string 3600 matplotlib 3336 dictionary 3203 windows 2594 tkinter 2547 linux 2483 mysql 2433 ... variable-assignment 133 sublimetext2 133 httplib 133 django-orm 133 tweepy 132 tags 132 glade 132 merge 131 dbus 131 blobstore 131 website 130 mvc 130 autocomplete 130 find 129 decimal 129 Length: 500, dtype: int64
# Plot the 'ironpython' count as a fraction of the 'python' count.
to_analyze['ironpython'].plot()
<matplotlib.axes.AxesSubplot at 0x11402e2d0>
# Plot the 'twisted' count as a fraction of the 'python' count.
to_analyze['twisted'].plot()
<matplotlib.axes.AxesSubplot at 0x114062390>
# Plot the 'tornado' count as a fraction of the 'python' count.
to_analyze['tornado'].plot()
<matplotlib.axes.AxesSubplot at 0x1164aab50>
# Rebuild the per-tag groups: restrict the merged rows to the top tags, then
# group them by tag.
filtered = merged.loc[merged['subtag'].isin(top.index)]
grouped = filtered.groupby(by='subtag')
def agg_monthly(group):
    """Count the rows of ``group`` per calendar year.

    NOTE: despite its name, this redefinition aggregates annually; the name
    is kept so existing calls to ``agg_monthly`` keep working.

    Parameters
    ----------
    group : DataFrame
        Must have a datetime-like 'CreationDate' column and a 'Score' column.

    Returns
    -------
    Series
        Indexed by year-end timestamp; each value is the number of non-null
        'Score' entries in that year (0 for years with no rows).
    """
    scores = group.set_index('CreationDate').Score
    # `resample(..., how='count')` no longer exists in current pandas; call
    # the aggregation on the resampler instead. An offset object is used
    # rather than the 'A' string alias, which newer pandas deprecates.
    return scores.resample(pd.offsets.YearEnd()).count()
# Re-run the per-tag aggregation (agg_monthly was just redefined to resample
# annually) and pivot so each tag becomes a column.
results = grouped.apply(agg_monthly).unstack('subtag')
# A stray bare `n` expression stood here; it referenced an undefined name
# and has been removed.
# Bar chart of the per-year 'django' counts.
results['django'].plot(kind='bar')
<matplotlib.axes.AxesSubplot at 0x116474310>
# Per-year counts relative to the 'python' count, then a bar chart of the
# 'django' column.
normed = results.divide(results['python'], axis='index')
normed['django'].plot.bar()
<matplotlib.axes.AxesSubplot at 0x1167e3190>
# Period-over-period fractional change in the normalised 'django' column.
normed['django'].pct_change()
CreationDate 2009-12-31 NaN 2010-12-31 -0.014833 2011-12-31 -0.052872 2012-12-31 -0.116431 2013-12-31 -0.149268 Name: django, dtype: float64
# Fractional change of every tag's normalised count in the final period.
# `.ix[-1]` no longer exists in current pandas; the index holds dates, so
# the old call was positional and `.iloc[-1]` is the direct equivalent.
whats_happening2013 = normed.pct_change().iloc[-1]
# Which tags mention 'sublime'?
[x for x in whats_happening2013.index if 'sublime' in x]
['sublimetext2']
# Look up the final-period fractional change for the 'sublimetext2' tag.
whats_happening2013['sublimetext2']
0.36393813179275791
# Rank tags by their final-period change, ascending. `Series.order()` no
# longer exists in current pandas; `sort_values()` is its replacement. The
# sort is done once and reused for both ends.
ordered = whats_happening2013.sort_values()
# 50 smallest changes (steepest declines).
downtrends = ordered.iloc[:50]
# Last 50 entries of the ascending ranking. NaN values sort last, so tags
# with no computable change also land here.
uptrends = ordered.iloc[-50:]
# Show the uptrends with the end of the ranking first.
uptrends[::-1]
subtag pylab NaN python-3.3 7.594584 user-interface 4.107720 raspberry-pi 3.542113 openerp 2.626263 enthought 2.061674 sympy 1.734087 python-2.7 1.197395 pandas 1.025044 scikit-learn 0.913183 python-import 0.782495 xlwt 0.738728 format 0.717129 flask-sqlalchemy 0.663131 nested 0.564855 matrix 0.547597 parameters 0.543436 python-multithreading 0.542176 node.js 0.535484 cx-freeze 0.534617 web-crawler 0.531504 histogram 0.518511 count 0.497674 return 0.488314 website 0.485113 3d 0.465417 xlrd 0.461540 mapreduce 0.457409 django-south 0.452425 graph 0.450032 python-3.x 0.433470 merge 0.424710 tkinter 0.412605 loops 0.412116 xml-parsing 0.380465 ipython 0.379742 python-requests 0.377138 serial-port 0.374284 sublimetext2 0.363938 printing 0.357086 decimal 0.349404 pyside 0.348702 pyinstaller 0.346390 pyserial 0.343298 networkx 0.333529 compare 0.322945 optimization 0.316720 flask 0.303641 split 0.297746 input 0.292707 Name: 2013-12-31 00:00:00, dtype: float64
# Display the 50 tags with the smallest (most negative) final-period change.
downtrends
subtag plone -0.712494 osx-lion -0.679115 interpreter -0.647215 minidom -0.624765 osx-snow-leopard -0.603116 xmpp -0.597193 mod-python -0.588417 pylons -0.572587 qt4 -0.567036 pygtk -0.563162 winapi -0.562059 .net -0.559018 permissions -0.555490 irc -0.554376 wx -0.551000 objective-c -0.546419 boost-python -0.516837 reportlab -0.507864 gui -0.494472 metaclass -0.478071 gmail -0.461853 metaprogramming -0.437748 cookies -0.436203 model -0.435835 programming-languages -0.433023 deployment -0.424806 timezone -0.422715 blobstore -0.415119 introspection -0.404675 emacs -0.398895 design -0.394230 filesystems -0.381131 functional-programming -0.378079 decorator -0.374820 ruby-on-rails -0.372370 packaging -0.364986 forms -0.364986 com -0.358231 mako -0.358231 dll -0.356948 webserver -0.356405 django-admin -0.352027 jquery -0.346309 caching -0.344279 plugins -0.340563 pyparsing -0.338527 zip -0.338527 authentication -0.337576 boolean -0.332775 web.py -0.332775 Name: 2013-12-31 00:00:00, dtype: float64