Brevísimo paseo por pandas
, statsmodels
y scikit-learn
.
# As usual
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
s = pd.Series([1,3,5,np.nan,6,8]) # creamos una serie
s
0 1 1 3 2 5 3 NaN 4 6 5 8 dtype: float64
dates = pd.date_range('20130101',periods=6)
dates
<class 'pandas.tseries.index.DatetimeIndex'> [2013-01-01 00:00:00, ..., 2013-01-06 00:00:00] Length: 6, Freq: D, Timezone: None
df = pd.DataFrame(np.random.randn(6,4),index=dates,columns=list('ABCD')) # creamos un dataframe
df
A | B | C | D | |
---|---|---|---|---|
2013-01-01 | 0.989619 | -0.107324 | 0.051252 | 0.314584 |
2013-01-02 | 0.735731 | -1.889398 | -0.015217 | -0.754001 |
2013-01-03 | 0.442580 | 0.837183 | -0.737065 | -1.483950 |
2013-01-04 | 0.468236 | -1.159737 | -1.633779 | -0.380115 |
2013-01-05 | 0.020721 | -1.004873 | 1.236132 | -1.916720 |
2013-01-06 | 0.053638 | 1.371354 | -1.152312 | -1.204630 |
# creamos un df a partir de un diccionario
df2 = pd.DataFrame({ 'A' : 1.,
'B' : pd.Timestamp('20130102'),
'C' : pd.Series(1,index=range(4),dtype='float32'),
'D' : np.array([3] * 4,dtype='int32'),
'E' : 'foo' })
df2
A | B | C | D | E | |
---|---|---|---|---|---|
0 | 1 | 2013-01-02 00:00:00 | 1 | 3 | foo |
1 | 1 | 2013-01-02 00:00:00 | 1 | 3 | foo |
2 | 1 | 2013-01-02 00:00:00 | 1 | 3 | foo |
3 | 1 | 2013-01-02 00:00:00 | 1 | 3 | foo |
df2.dtypes
A float64 B datetime64[ns] C float32 D int32 E object dtype: object
df.head()
A | B | C | D | |
---|---|---|---|---|
2013-01-01 | 0.989619 | -0.107324 | 0.051252 | 0.314584 |
2013-01-02 | 0.735731 | -1.889398 | -0.015217 | -0.754001 |
2013-01-03 | 0.442580 | 0.837183 | -0.737065 | -1.483950 |
2013-01-04 | 0.468236 | -1.159737 | -1.633779 | -0.380115 |
2013-01-05 | 0.020721 | -1.004873 | 1.236132 | -1.916720 |
df.tail(3)
A | B | C | D | |
---|---|---|---|---|
2013-01-04 | 0.468236 | -1.159737 | -1.633779 | -0.380115 |
2013-01-05 | 0.020721 | -1.004873 | 1.236132 | -1.916720 |
2013-01-06 | 0.053638 | 1.371354 | -1.152312 | -1.204630 |
df.index
<class 'pandas.tseries.index.DatetimeIndex'> [2013-01-01 00:00:00, ..., 2013-01-06 00:00:00] Length: 6, Freq: D, Timezone: None
df.columns
Index([u'A', u'B', u'C', u'D'], dtype=object)
df.values
array([[ 0.98961883, -0.10732398, 0.05125175, 0.3145839 ], [ 0.73573078, -1.88939812, -0.01521745, -0.75400056], [ 0.44257962, 0.83718315, -0.73706537, -1.48394989], [ 0.46823576, -1.15973698, -1.63377881, -0.38011494], [ 0.02072137, -1.00487349, 1.23613156, -1.91671962], [ 0.05363753, 1.37135419, -1.15231187, -1.20463048]])
df.describe
<bound method DataFrame.describe of A B C D 2013-01-01 0.989619 -0.107324 0.051252 0.314584 2013-01-02 0.735731 -1.889398 -0.015217 -0.754001 2013-01-03 0.442580 0.837183 -0.737065 -1.483950 2013-01-04 0.468236 -1.159737 -1.633779 -0.380115 2013-01-05 0.020721 -1.004873 1.236132 -1.916720 2013-01-06 0.053638 1.371354 -1.152312 -1.204630>
df.T
2013-01-01 00:00:00 | 2013-01-02 00:00:00 | 2013-01-03 00:00:00 | 2013-01-04 00:00:00 | 2013-01-05 00:00:00 | 2013-01-06 00:00:00 | |
---|---|---|---|---|---|---|
A | 0.989619 | 0.735731 | 0.442580 | 0.468236 | 0.020721 | 0.053638 |
B | -0.107324 | -1.889398 | 0.837183 | -1.159737 | -1.004873 | 1.371354 |
C | 0.051252 | -0.015217 | -0.737065 | -1.633779 | 1.236132 | -1.152312 |
D | 0.314584 | -0.754001 | -1.483950 | -0.380115 | -1.916720 | -1.204630 |
df.sort_index(axis=1, ascending=False)
D | C | B | A | |
---|---|---|---|---|
2013-01-01 | 0.314584 | 0.051252 | -0.107324 | 0.989619 |
2013-01-02 | -0.754001 | -0.015217 | -1.889398 | 0.735731 |
2013-01-03 | -1.483950 | -0.737065 | 0.837183 | 0.442580 |
2013-01-04 | -0.380115 | -1.633779 | -1.159737 | 0.468236 |
2013-01-05 | -1.916720 | 1.236132 | -1.004873 | 0.020721 |
2013-01-06 | -1.204630 | -1.152312 | 1.371354 | 0.053638 |
df.sort(columns='B')
A | B | C | D | |
---|---|---|---|---|
2013-01-02 | 0.735731 | -1.889398 | -0.015217 | -0.754001 |
2013-01-04 | 0.468236 | -1.159737 | -1.633779 | -0.380115 |
2013-01-05 | 0.020721 | -1.004873 | 1.236132 | -1.916720 |
2013-01-01 | 0.989619 | -0.107324 | 0.051252 | 0.314584 |
2013-01-03 | 0.442580 | 0.837183 | -0.737065 | -1.483950 |
2013-01-06 | 0.053638 | 1.371354 | -1.152312 | -1.204630 |
df['A']
2013-01-01 0.989619 2013-01-02 0.735731 2013-01-03 0.442580 2013-01-04 0.468236 2013-01-05 0.020721 2013-01-06 0.053638 Freq: D, Name: A, dtype: float64
df[0:3]
A | B | C | D | |
---|---|---|---|---|
2013-01-01 | 0.989619 | -0.107324 | 0.051252 | 0.314584 |
2013-01-02 | 0.735731 | -1.889398 | -0.015217 | -0.754001 |
2013-01-03 | 0.442580 | 0.837183 | -0.737065 | -1.483950 |
df['20130102':'20130104']
A | B | C | D | |
---|---|---|---|---|
2013-01-02 | 0.735731 | -1.889398 | -0.015217 | -0.754001 |
2013-01-03 | 0.442580 | 0.837183 | -0.737065 | -1.483950 |
2013-01-04 | 0.468236 | -1.159737 | -1.633779 | -0.380115 |
df.mean()
A 0.451754 B -0.325466 C -0.375165 D -0.904139 dtype: float64
df.mean(1)
2013-01-01 0.312033 2013-01-02 -0.480721 2013-01-03 -0.235313 2013-01-04 -0.676349 2013-01-05 -0.416185 2013-01-06 -0.232988 Freq: D, dtype: float64
df.apply(np.cumsum)
A | B | C | D | |
---|---|---|---|---|
2013-01-01 | 0.989619 | -0.107324 | 0.051252 | 0.314584 |
2013-01-02 | 1.725350 | -1.996722 | 0.036034 | -0.439417 |
2013-01-03 | 2.167929 | -1.159539 | -0.701031 | -1.923367 |
2013-01-04 | 2.636165 | -2.319276 | -2.334810 | -2.303481 |
2013-01-05 | 2.656886 | -3.324149 | -1.098678 | -4.220201 |
2013-01-06 | 2.710524 | -1.952795 | -2.250990 | -5.424832 |
df = pd.DataFrame(np.random.randn(10, 4))
df
0 | 1 | 2 | 3 | |
---|---|---|---|---|
0 | 0.227153 | 0.917822 | -1.608276 | 0.419924 |
1 | 0.725317 | 1.045716 | -0.322178 | 0.212421 |
2 | 0.465058 | 0.539302 | -1.046666 | -0.243255 |
3 | 0.101423 | 0.957514 | 0.236958 | -0.400488 |
4 | -0.096053 | 1.488869 | -0.329410 | -1.696989 |
5 | 0.449934 | 1.279246 | 1.173663 | -0.274385 |
6 | -0.368607 | 0.883130 | -1.665743 | 0.626339 |
7 | -0.634410 | -0.528353 | 1.691467 | 0.374490 |
8 | -1.811590 | -0.421780 | 0.627348 | -0.089724 |
9 | 1.421926 | 0.517931 | 2.129783 | 0.508625 |
pieces = [df[:3], df[3:7], df[7:]]
pd.concat(pieces)
0 | 1 | 2 | 3 | |
---|---|---|---|---|
0 | 0.227153 | 0.917822 | -1.608276 | 0.419924 |
1 | 0.725317 | 1.045716 | -0.322178 | 0.212421 |
2 | 0.465058 | 0.539302 | -1.046666 | -0.243255 |
3 | 0.101423 | 0.957514 | 0.236958 | -0.400488 |
4 | -0.096053 | 1.488869 | -0.329410 | -1.696989 |
5 | 0.449934 | 1.279246 | 1.173663 | -0.274385 |
6 | -0.368607 | 0.883130 | -1.665743 | 0.626339 |
7 | -0.634410 | -0.528353 | 1.691467 | 0.374490 |
8 | -1.811590 | -0.421780 | 0.627348 | -0.089724 |
9 | 1.421926 | 0.517931 | 2.129783 | 0.508625 |
# groupby
df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
'foo', 'bar', 'foo', 'foo'],
'B' : ['one', 'one', 'two', 'three',
'two', 'two', 'one', 'three'],
'C' : np.random.randn(8),
'D' : np.random.randn(8)})
df
A | B | C | D | |
---|---|---|---|---|
0 | foo | one | -0.408356 | 1.440525 |
1 | bar | one | -1.332099 | -0.868338 |
2 | foo | two | -0.296245 | -0.078245 |
3 | bar | three | -2.337543 | 0.382080 |
4 | foo | two | 1.059835 | 1.553611 |
5 | bar | two | -0.538388 | -1.696707 |
6 | foo | one | -0.106452 | -2.339282 |
7 | foo | three | 0.516394 | -0.555832 |
df.groupby('A').sum()
C | D | |
---|---|---|
A | ||
bar | -4.208029 | -2.182965 |
foo | 0.765176 | 0.020777 |
In [86]: df.groupby(['A','B']).sum()
C | D | ||
---|---|---|---|
A | B | ||
bar | one | -1.332099 | -0.868338 |
three | -2.337543 | 0.382080 | |
two | -0.538388 | -1.696707 | |
foo | one | -0.514808 | -0.898757 |
three | 0.516394 | -0.555832 | |
two | 0.763590 | 1.475365 |
# tablas pivotantes
df = pd.DataFrame({ 'A' : ['one', 'one', 'two', 'three'] * 3,
'B' : ['A', 'B', 'C'] * 4,
'C' : ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2,
'D' : np.random.randn(12),
'E' : np.random.randn(12)})
df
A | B | C | D | E | |
---|---|---|---|---|---|
0 | one | A | foo | 1.033876 | 0.452061 |
1 | one | B | foo | 0.039088 | 2.136289 |
2 | two | C | foo | 0.188529 | -1.290183 |
3 | three | A | bar | 1.046199 | -1.762031 |
4 | one | B | bar | 0.900097 | -0.274119 |
5 | one | C | bar | 1.464419 | -0.034841 |
6 | two | A | foo | -0.889491 | -1.102912 |
7 | three | B | foo | -0.479741 | 0.144639 |
8 | one | C | foo | 1.257836 | 0.628220 |
9 | one | A | bar | -1.126975 | 0.760057 |
10 | two | B | bar | -1.531827 | -0.426839 |
11 | three | C | bar | -0.834446 | 1.854987 |
pd.pivot_table(df, values='D', rows=['A', 'B'], cols=['C'])
C | bar | foo | |
---|---|---|---|
A | B | ||
one | A | -1.126975 | 1.033876 |
B | 0.900097 | 0.039088 | |
C | 1.464419 | 1.257836 | |
three | A | 1.046199 | NaN |
B | NaN | -0.479741 | |
C | -0.834446 | NaN | |
two | A | NaN | -0.889491 |
B | -1.531827 | NaN | |
C | NaN | 0.188529 |
ts = pd.Series(np.random.randn(1000), index=pd.date_range('1/1/2000', periods=1000))
df = pd.DataFrame(np.random.randn(1000, 4), index=ts.index,
columns=['A', 'B', 'C', 'D'])
df = df.cumsum()
plt.figure();
df.plot();
plt.legend(loc='best')
<matplotlib.legend.Legend at 0x2d2cad0>
<matplotlib.figure.Figure at 0x2d2c690>
import statsmodels.api as sm
nsample = 50
sig = 0.25
x1 = np.linspace(0, 20, nsample)
X = np.c_[x1, np.sin(x1), (x1-5)**2, np.ones(nsample)]
beta = [0.5, 0.5, -0.02, 5.]
y_true = np.dot(X, beta)
y = y_true + sig * np.random.normal(size=nsample)
olsmod = sm.OLS(y, X)
olsres = olsmod.fit()
print olsres.params
print olsres.bse
[ 0.48891003 0.52398592 -0.01926634 5.00909221] [ 0.01557795 0.06123875 0.00136776 0.10100801]
ypred = olsres.predict(X)
x1n = np.linspace(20.5,25, 10)
Xnew = np.c_[x1n, np.sin(x1n), (x1n-5)**2, np.ones(10)]
ynewpred = olsres.predict(Xnew) # predict out of sample
print ypred
plt.figure()
plt.plot(x1, y, 'o', x1, y_true, 'b-')
plt.plot(np.hstack((x1, x1n)), np.hstack((ypred, ynewpred)),'r')
plt.title('OLS, azul: verdadero, rojo: valores predichos')
[ 4.52743362 5.01039982 5.45277562 5.82600424 6.1118349 6.30532138 6.4156346 6.46455598 6.48289869 6.50544524 6.56523273 6.68812502 6.88856329 7.16719347 7.51076009 7.89428408 8.28516682 8.64854595 8.95302321 9.17582301 9.30653449 9.34882191 9.31982267 9.24733166 9.16523466 9.107941 9.10473016 9.17494132 9.32479589 9.54637543 9.81892258 10.11225146 10.39170709 10.62385916 10.7819945 10.85050596 10.82745667 10.72489823 10.56689066 10.38554944 10.21576899 10.08948963 10.0304499 10.05028635 10.14662169 10.30345539 10.49379254 10.68407897 10.83971418 10.93073606]
<matplotlib.text.Text at 0x3ebb090>
import numpy as np
import pylab as pl
from matplotlib.colors import ListedColormap
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.lda import LDA
from sklearn.qda import QDA
h = .02 # step size in the mesh
names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", "Decision Tree",
"Random Forest", "Naive Bayes", "LDA", "QDA"]
classifiers = [
KNeighborsClassifier(3),
SVC(kernel="linear", C=0.025),
SVC(gamma=2, C=1),
DecisionTreeClassifier(max_depth=5),
RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
GaussianNB(),
LDA(),
QDA()]
X, y = make_classification(n_features=2, n_redundant=0, n_informative=2,
random_state=1, n_clusters_per_class=1)
rng = np.random.RandomState(2)
X += 2 * rng.uniform(size=X.shape)
linearly_separable = (X, y)
datasets = [make_moons(noise=0.3, random_state=0),
make_circles(noise=0.2, factor=0.5, random_state=1),
linearly_separable
]
figure = pl.figure(figsize=(24, 8))
i = 1
# iterate over datasets
for ds in datasets:
# preprocess dataset, split into training and test part
X, y = ds
X = StandardScaler().fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4)
x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
np.arange(y_min, y_max, h))
# just plot the dataset first
cm = pl.cm.RdBu
cm_bright = ListedColormap(['#FF0000', '#0000FF'])
ax = pl.subplot(len(datasets), len(classifiers) + 1, i)
# Plot the training points
ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright)
# and testing points
ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, alpha=0.6)
ax.set_xlim(xx.min(), xx.max())
ax.set_ylim(yy.min(), yy.max())
ax.set_xticks(())
ax.set_yticks(())
i += 1
# iterate over classifiers
for name, clf in zip(names, classifiers):
ax = pl.subplot(len(datasets), len(classifiers) + 1, i)
clf.fit(X_train, y_train)
score = clf.score(X_test, y_test)
# Plot the decision boundary. For that, we will asign a color to each
# point in the mesh [x_min, m_max]x[y_min, y_max].
if hasattr(clf, "decision_function"):
Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
else:
Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
# Put the result into a color plot
Z = Z.reshape(xx.shape)
ax.contourf(xx, yy, Z, cmap=cm, alpha=.8)
# Plot also the training points
ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright)
# and testing points
ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright,
alpha=0.6)
ax.set_xlim(xx.min(), xx.max())
ax.set_ylim(yy.min(), yy.max())
ax.set_xticks(())
ax.set_yticks(())
ax.set_title(name)
ax.text(xx.max() - .3, yy.min() + .3, ('%.2f' % score).lstrip('0'),
size=15, horizontalalignment='right')
i += 1
figure.subplots_adjust(left=.02, right=.98)
pl.show()