import numpy as np ## vector and matrix operations
import scipy as sp ## grab-bag of statistical and science tools
import matplotlib.pyplot as plt ## matplotlib - plots
import pandas as pd ## emulates R data frames
import statsmodels.api as sm ## scikits.statsmodels - statistics library
import patsy ## emulates R model formulas
import sklearn as skl ## scikits.learn - machine learning library
from sklearn import mixture as sklmix
Installation on Linux (Ubuntu): install each package with
(sudo) pip install [package]
.
Installation on Mac and Windows: see the relevant links at the end of this notebook.
# Simulate noisy linear data, y = x + N(0, 1), and scatter-plot it.
x = np.linspace(0, 10, 100)  # 100 evenly spaced points on [0, 10]
y = np.random.normal(loc=x, scale=1.0)  # Gaussian noise centered on x
plt.plot(x, y, 'bo')  # blue circles
[<matplotlib.lines.Line2D at 0x58f8290>]
# Fit ordinary least squares of y on x with statsmodels.
# NOTE(review): no intercept column is added (no sm.add_constant), so this is
# regression through the origin -- presumably intentional for the demo; it
# explains the Df Model = 0 / F-statistic = inf in the summary output below.
ols_model = sm.OLS(y, x)   # model specification
ols_fit = ols_model.fit()  # estimate the coefficient
yhat = ols_fit.predict()   # fitted values at the training points
print ols_fit.summary()
# Overlay the data (blue dots) with the fitted line (red, width 2).
plt.plot(x, y, 'bo', x, yhat, 'r-', lw=2)
OLS Regression Results ============================================================================== Dep. Variable: y R-squared: 0.896 Model: OLS Adj. R-squared: 0.896 Method: Least Squares F-statistic: inf Date: Tue, 11 Jun 2013 Prob (F-statistic): nan Time: 11:03:43 Log-Likelihood: -143.13 No. Observations: 100 AIC: 288.3 Df Residuals: 99 BIC: 290.9 Df Model: 0 ============================================================================== coef std err t P>|t| [95.0% Conf. Int.] ------------------------------------------------------------------------------ x1 1.0084 0.018 57.360 0.000 0.974 1.043 ============================================================================== Omnibus: 1.215 Durbin-Watson: 1.854 Prob(Omnibus): 0.545 Jarque-Bera (JB): 1.193 Skew: 0.257 Prob(JB): 0.551 Kurtosis: 2.851 Cond. No. 1.00 ==============================================================================
[<matplotlib.lines.Line2D at 0x5b35110>, <matplotlib.lines.Line2D at 0x5b35310>]
# Load the iris data set from a local CSV into a pandas DataFrame and peek at it.
iris = pd.read_csv('iris.csv')
print iris, '\n'     # older pandas prints a frame summary here (see output below)
print iris.head(10)  # first 10 rows
<class 'pandas.core.frame.DataFrame'> Int64Index: 150 entries, 0 to 149 Data columns: Type 150 non-null values PW 150 non-null values PL 150 non-null values SW 150 non-null values SL 150 non-null values dtypes: int64(5) Type PW PL SW SL 0 0 2 14 33 50 1 1 24 56 31 67 2 1 23 51 31 69 3 0 2 10 36 46 4 1 20 52 30 65 5 1 19 51 27 58 6 2 13 45 28 57 7 2 16 47 33 63 8 1 17 45 25 49 9 2 14 47 32 70
# Summary statistics for every column, rounded to 2 decimal places.
print np.round(iris.describe(), 2)
Type PW PL SW SL count 150.00 150.00 150.00 150.00 150.00 mean 1.00 11.93 37.79 30.55 58.45 std 0.82 7.57 17.78 4.37 8.27 min 0.00 1.00 10.00 20.00 43.00 25% 0.00 3.00 16.00 28.00 51.00 50% 1.00 13.00 44.00 30.00 58.00 75% 2.00 18.00 51.00 33.00 64.00 max 2.00 25.00 69.00 44.00 79.00
# Group the rows by species code ('Type') and inspect the groups.
type_grp = iris.groupby('Type')
print "number of groups:", type_grp.ngroups, "\n"
print "maximums:\n", type_grp.max(), "\n"  # per-group column maxima
print "size of each group:\n", type_grp.size(), "\n"  # row count per group
number of groups: 3 maximums: PW PL SW SL Type 0 6 19 44 58 1 25 69 38 79 2 18 56 34 70 size of each group: Type 0 50 1 50 2 50
# 3-D scatter plot of three iris measurements, colored by species code.
from mpl_toolkits.mplot3d import Axes3D
fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(iris['PW'], iris['PL'], iris['SW'], c=iris['Type'])  # color = species
ax.set_xlabel('PW')
ax.set_ylabel('PL')
ax.set_zlabel('SW')
ax.azim = 140  # rotate the viewpoint so the clusters separate visually
ax.elev = 15
# IPython magic: list every variable currently defined in the session.
whos
Variable Type Data/Info ------------------------------------------------- Axes3D type <class 'mpl_toolkits.mplot3d.axes3d.Axes3D'> ax Axes3DSubplot Axes(0.125,0.125;0.775x0.775) fig Figure Figure(640x640) iris DataFrame <class 'pandas.core.frame<...> values\ndtypes: int64(5) ols_fit RegressionResultsWrapper <statsmodels.regression.l<...>pper object at 0x5b03c90> ols_model OLS <statsmodels.regression.l<...>.OLS object at 0x44b28d0> patsy module <module 'patsy' from '/us<...>ages/patsy/__init__.pyc'> pd module <module 'pandas' from '/u<...>ges/pandas/__init__.pyc'> skl module <module 'sklearn' from '/<...>es/sklearn/__init__.pyc'> sklmix module <module 'sklearn.mixture'<...>rn/mixture/__init__.pyc'> sm module <module 'statsmodels.api'<...>2.7/statsmodels/api.pyc'> sp module <module 'scipy' from '/us<...>ages/scipy/__init__.pyc'> type_grp DataFrameGroupBy <pandas.core.groupby.Data<...>upBy object at 0x5b40a90> x ndarray 100: 100 elems, type `float64`, 800 bytes y ndarray 100: 100 elems, type `float64`, 800 bytes yhat ndarray 100: 100 elems, type `float64`, 800 bytes
# Four equivalent ways to pull up documentation in IPython (left commented out):
# %pdoc sklmix.GMM
# %psource sklmix.GMM
#sklmix.GMM?
#sklmix.GMM??
# Fit a 3-component Gaussian mixture to three iris features (unsupervised).
# NOTE(review): sklearn.mixture.GMM is the old scikit-learn API; newer versions
# renamed it GaussianMixture -- confirm against the installed version.
gmm_model = sklmix.GMM(n_components=3, covariance_type='full')
gmm_model.fit(iris[['PW', 'PL', 'SW']])
yhat = gmm_model.predict(iris[['PW', 'PL', 'SW']])
# Cross-tabulate true species vs. predicted cluster; cluster numbering is
# arbitrary, so it need not line up with the species codes.
crosstab = pd.crosstab(iris['Type'], yhat, rownames=['true'], colnames=['predicted'])
print crosstab
predicted 0 1 2 true 0 50 0 0 1 0 13 37 2 0 45 5
# Relabel the arbitrary cluster ids to best match the true labels by solving
# an assignment problem (Hungarian algorithm, via the munkres package).
import munkres
import sys
m = munkres.Munkres()
# Munkres minimizes cost, so convert the agreement counts (to be maximized)
# into costs by subtracting from a large constant (sys.maxint is Python-2-only;
# Python 3 would use sys.maxsize).
cost = munkres.make_cost_matrix(crosstab.values.tolist(), lambda x : sys.maxint - x)
align = m.compute(cost)  # (row, col) pairs of the optimal matching
print align, '\n'
permute = [x[1] for x in align]  # cluster id assigned to each true label
new_label = np.argsort(permute)  # inverse permutation: old id -> new id
yhat_new = new_label[yhat]       # apply the relabeling to every prediction
print pd.crosstab(iris['Type'], yhat_new, rownames=['true'], colnames=['predicted'])
[(0, 0), (1, 2), (2, 1)] predicted 0 1 2 true 0 50 0 0 1 0 37 13 2 0 5 45
# Call R from Python via rpy2: Ward hierarchical clustering on the same features.
from rpy2.robjects import r
from rpy2.robjects.numpy2ri import numpy2ri as np2r
Xr = np2r(iris[['PW', 'PL', 'SW']].values)  # numpy array -> R matrix
d = r.dist(Xr)  # pairwise distance matrix (R's dist())
# NOTE(review): newer R versions spell this method 'ward.D' / 'ward.D2' -- confirm.
tree = r.hclust(d, method='ward')  # hierarchical clustering
yhat_hclust = r.cutree(tree, k=3)  # cut the dendrogram into 3 clusters
print pd.crosstab(iris['Type'], yhat_hclust, rownames=['true'], colnames=['predicted'])
predicted 1 2 3 true 0 50 0 0 1 0 35 15 2 0 0 50
# Use an R package (e1071) through rpy2: fit an SVM classifier on the iris data.
import rpy2.robjects as robjects
from rpy2.robjects.packages import importr
r = robjects.r
e1071 = importr('e1071')  # R package providing svm()
Yr = np2r(iris['Type'])
Yr = r.factor(Yr)  # labels must be an R factor for classification
svm = e1071.svm(Xr, Yr)  # Xr was converted in the hclust cell above
yhat = r.predict(svm, Xr)  # in-sample predictions
print r.table(yhat, Yr)  # confusion table via R's table()
0 1 2 0 50 0 0 1 0 46 2 2 0 4 48
Thanks to Fei Yu for this vignette.
# Draw a ggplot2 figure from Python via rpy2, save it to PNG, and display it.
import rpy2.robjects as robjects
from rpy2.robjects.packages import importr
r = robjects.r
r.library("ggplot2")
rnorm = r["rnorm"]  # handle to R's rnorm()
# NOTE(review): adding a length-300 and a length-100 R vector relies on R's
# recycling rule (100 divides 300) -- presumably intentional, to shift a
# subset of points by +3; confirm the lengths were meant to differ.
dtf = robjects.DataFrame({'x': rnorm(300, mean=0) + rnorm(100, mean=3),
'y': rnorm(300, mean=0) + rnorm(100, mean=3)})
robjects.globalenv['dtf'] = dtf # assign to R's environment
# Build the plot object inside R, then save it to disk with ggsave.
r("gp = ggplot(dtf, aes(x, y))")
r("pp = gp + geom_point(size=0.6)")
# r("plot(pp)") # I can't get this to work on my system, but saving the plot is just as good.
r("ggsave(filename='test.png', plot=pp, scale=0.3)")
# Display the saved image inline in the notebook.
from IPython.core.display import Image
Image(filename='test.png')
Saving 2.1 x 2.1 in image
Relevant links:
For Ubuntu:
sudo apt-get install ipython-notebook
ipython notebook --pylab=inline --script
nbconvert latex statBytes-python
pdflatex statBytes-python.tex
The --pylab=inline
flag embeds matplotlib output in the notebook rather than popping up a new window for each figure. The --script
flag saves a .py
script along with the .ipynb
file which is ready to be run in a regular python or ipython prompt.
There are (at least) two ways to post a notebook online. For this notebook I pushed the .ipynb file to a public github repo, opened the file on the github website, clicked the "raw" button, and pasted the URL into the box at the top of the nbviewer website. Pretty straightforward.