The pandas library provides two data structures, Series and DataFrame (both built on top of NumPy), that make working with time-series and tabular data much easier.
%pylab inline
Welcome to pylab, a matplotlib-based Python environment [backend: module://IPython.zmq.pylab.backend_inline]. For more information, type 'help(pylab)'.
import pandas as pd
!head data/players_dat.csv
date,user,gender,age,country,score,cash 2013-01-01,Abdul,M,20,FR,5,73.84 2013-01-04,Earle,M,36,SG,8,51.13 2013-01-05,Tyron,M,40,US,3,57.15 2013-01-08,Chris,M,24,FR,6,65.63 2013-01-09,Mark,M,24,FR,2,90.51 2013-01-12,May,F,28,US,3,46.32 2013-01-13,Charlene,F,43,US,3,27.27 2013-01-17,Ada,F,23,US,4,89.23 2013-01-18,Wei,F,43,US,8,74.80
# Load the players table from CSV into a DataFrame and preview its first
# five rows (head() defaults to n=5).
csv_path = 'data/players_dat.csv'
df = pd.read_csv(csv_path)
df.head()
date | user | gender | age | country | score | cash | |
---|---|---|---|---|---|---|---|
0 | 2013-01-01 | Abdul | M | 20 | FR | 5 | 73.84 |
1 | 2013-01-04 | Earle | M | 36 | SG | 8 | 51.13 |
2 | 2013-01-05 | Tyron | M | 40 | US | 3 | 57.15 |
3 | 2013-01-08 | Chris | M | 24 | FR | 6 | 65.63 |
4 | 2013-01-09 | Mark | M | 24 | FR | 2 | 90.51 |
# Check the type of the object that read_csv returned.
type(df)
pandas.core.frame.DataFrame
# Five rows with the largest `cash` value, highest first.
# DataFrame.sort() was deprecated in pandas 0.17 and removed in 0.20;
# sort_values() is its replacement. `ascending` is a flag, so pass a bool
# rather than 0.
df.sort_values(by='cash', ascending=False).head(5)
date | user | gender | age | country | score | cash | |
---|---|---|---|---|---|---|---|
16 | 2013-02-21 | Chris | M | 36 | SG | 2 | 99.40 |
10 | 2013-01-25 | Charlene | F | 40 | FR | 5 | 96.06 |
15 | 2013-02-19 | May | F | 38 | SG | 5 | 94.37 |
19 | 2013-02-26 | Mark | M | 20 | FR | 7 | 94.02 |
24 | 2013-03-10 | Wei | F | 21 | SG | 1 | 93.19 |
# Summary statistics (count, mean, std, min, quartiles, max) for the
# numeric columns of the table.
df.describe()
age | score | cash | |
---|---|---|---|
count | 30.000000 | 30.000000 | 30.000000 |
mean | 31.800000 | 4.333333 | 66.386000 |
std | 8.515301 | 2.509751 | 21.023171 |
min | 20.000000 | 1.000000 | 22.130000 |
25% | 24.000000 | 2.000000 | 52.145000 |
50% | 33.500000 | 4.000000 | 64.200000 |
75% | 39.750000 | 6.750000 | 86.902500 |
max | 44.000000 | 9.000000 | 99.400000 |
# Average score per gender, drawn as a bar chart with horizontal tick labels.
mean_score_by_gender = df.groupby('gender')['score'].mean()
mean_score_by_gender.plot(kind='bar', title="Scores", rot=0)
<matplotlib.axes.AxesSubplot at 0x107198d10>
Machine learning in Python (PCA, ICA, LDA, SVM, k-means clustering,...)
(based on https://github.com/amueller/tutorial_ml_gkbionics)
from sklearn import datasets

# Generate synthetic clustered data around 4 centres (X == features,
# L == labels). random_state is fixed so every run draws the same points.
X, L = datasets.make_blobs(centers=4, cluster_std=0.5, random_state=2)

# Plot the two feature columns against each other, without label colours.
feature_0, feature_1 = X[:, 0], X[:, 1]
scatter(feature_0, feature_1)
<matplotlib.collections.PathCollection at 0x107a4a990>
# Same scatter plot, with each point coloured by its generated label L.
scatter(X[:,0], X[:,1], c=L)
<matplotlib.collections.PathCollection at 0x107a7ac90>
from sklearn.cluster import KMeans

# Fit k-means on the features only (the labels L are not used); the
# argument 4 is the number of clusters requested. fit() returns the
# estimator itself, so construction and fitting can be chained.
km = KMeans(4).fit(X)

# Colour each point by the cluster k-means assigned it to.
scatter(X[:, 0], X[:, 1], c=km.labels_)
<matplotlib.collections.PathCollection at 0x10963d5d0>
# Cluster centres found by k-means: one row per cluster, one column per feature.
mu = km.cluster_centers_
# print() with a single argument behaves the same on Python 2 and 3; the
# bare `print mu` statement is a SyntaxError on Python 3.
print(mu)

# Data points, faded, coloured by their assigned cluster ...
scatter(X[:, 0], X[:, 1], c=km.labels_, alpha=0.5)
# ... with the centres drawn larger on top. np.unique returns the sorted
# label values, so each centre is coloured by a distinct label value.
scatter(mu[:, 0], mu[:, 1], s=100, c=np.unique(km.labels_))
[[-5.77707127 2.47726437] [-1.40113893 -9.37815706] [-1.52252875 -3.51768581] [ 0.92452987 -1.21956947]]
<matplotlib.collections.PathCollection at 0x109622410>
Image processing in Python (scaling, rotation, affine transforms, color processing, filtering, segmentation)
from skimage.data import camera

# Load scikit-image's bundled "camera" test image and report its shape and
# intensity range.
im_orig = camera()
# %-formatting prints the same text on Python 2 and 3; the original
# `print a, b` statements are a SyntaxError on Python 3.
print('image shape: %s' % (im_orig.shape,))
print('image (min,max): %s %s' % (im_orig.min(), im_orig.max()))

gray()  # set default colormap to gray scale
imshow(im_orig)
image shape: (512, 512) image (min,max): 0 255
<matplotlib.image.AxesImage at 0x10976a9d0>
from skimage.transform import resize, rotate, rescale

# Downsample to 128x128. The resized pixels are multiplied by 255 before
# the integer cast so the result is back on a 0-255 intensity scale
# (presumably because resize returns floats in [0, 1] -- confirm against
# the installed scikit-image version).
img = (resize(im_orig, (128, 128)) * 255).astype('uint16')

# print() calls with one argument behave the same on Python 2 and 3.
print(img.dtype)
print(img.shape)
# The original `print (a, b)` prints a tuple on Python 2 but two separate
# values on Python 3; format explicitly so both give "(min, max)".
print('(%d, %d)' % (img.min(), img.max()))
uint16 (128, 128) (4, 252)
# Rotate the small image by 15 degrees (interpolation order 2). The image
# is converted to float and divided by 255 before being passed to rotate().
img_unit = img.astype('float') / 255
im_r = rotate(img_unit, angle=15, order=2)

# Show the unrotated image (left) next to the rotated one (right).
plt.figure(figsize=(8, 4))
for position, picture in ((121, img), (122, im_r)):
    plt.subplot(position)
    plt.imshow(picture)
plt.show()
# Histogram of pixel intensities. hist() treats a 2-D array as one data
# set per column, so passing `img` directly would draw 128 overlaid
# histograms; flatten it to get a single histogram over all pixels.
h = hist(img.ravel())
# Binarise the image two ways and compare: a hand-picked mid-range
# threshold (127) versus the threshold chosen by Otsu's method.
try:
    from skimage.filters import threshold_otsu
except ImportError:
    # Older scikit-image releases only provide the module under its
    # previous name, `skimage.filter`.
    from skimage.filter import threshold_otsu

thres = threshold_otsu(img)
# %-formatting prints the same text on Python 2 and 3.
print("otsu thres: %s" % thres)

plt.figure(figsize=(8, 4))

# Left panel: fixed threshold.
plt.subplot(121)
plt.imshow(img > 127)
plt.title("Manual")
plt.axis('off')

# Right panel: Otsu threshold.
plt.subplot(122)
plt.imshow(img > thres)
plt.title("Otsu")
plt.axis('off')
otsu thres: 87
(-0.5, 127.5, 127.5, -0.5)
# Edge map of the image using the Sobel filter.
try:
    from skimage.filters import sobel
except ImportError:
    # Older scikit-image releases only provide the module under its
    # previous name, `skimage.filter`.
    from skimage.filter import sobel

imshow(sobel(img))
<matplotlib.image.AxesImage at 0x10e8194d0>