%load_ext autoreload
%autoreload 2
%matplotlib inline
import matplotlib.pyplot as plt
%config InlineBackend.figure_format = 'svg'
import seaborn as sns
# nice / large graphs
sns.set_context("notebook")
plt.rcParams["figure.figsize"] = (8, 5)
from sklearn import datasets
import pandas as pd
iris = datasets.load_iris()
iris['data'][:5, :]
iris['target'][:5]
Writing and reading a .csv file with pandas
df = pd.DataFrame(iris['data'], columns=iris.feature_names)
df['target'] = iris['target']
df.target = df.target.map(dict(zip(range(3), iris.target_names)))
df.to_csv('iris.csv', index=False)
import pandas as pd
df = pd.read_csv('iris.csv')
df.head()
y = df['target'].map({'setosa': 0, 'versicolor': 1, 'virginica': 2})
X = df.drop('target', axis=1)
X.head()
y.value_counts()
# filtering missing values
X = X.dropna()
# cleaning data
X = X[X['sepal length (cm)'] > 0]
# standardization
X = (X - X.mean()) / X.std()
# dummy variables
pd.get_dummies(df['target']).head()
# new features
X['sepal length (cm)^2'] = X['sepal length (cm)']**2
X.head()
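Such power and interaction terms can also be generated automatically; a minimal sketch with sklearn's PolynomialFeatures (not used in the rest of this notebook):
from sklearn.preprocessing import PolynomialFeatures
# all degree-2 terms (squares and pairwise products) of the current columns
X_poly = PolynomialFeatures(degree=2, include_bias=False).fit_transform(X)
X_poly.shape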
news = datasets.fetch_20newsgroups(subset='train', categories=['alt.atheism', 'soc.religion.christian'])
print(news.data[2][:600])
from sklearn.feature_extraction.text import CountVectorizer
# bag of words representation
count_vect = CountVectorizer()
X_counts = count_vect.fit_transform(news.data)
X_counts
# density: fraction of non-zero entries (the matrix is mostly zeros)
X_counts.nnz / (X_counts.shape[0] * X_counts.shape[1])
Standardization = normalize your data to have zero mean and unit standard deviation.
from sklearn import preprocessing
scaler = preprocessing.StandardScaler()
scaler.fit(iris['data'])
scaler.mean_, scaler.scale_
X_iris_transformed = scaler.transform(iris['data'])
X_iris_transformed[:, 0].mean()
import numpy as np
from sklearn.impute import SimpleImputer
# replace missing values with the per-column mean learned during fit
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imp.fit([[1, 2], [np.nan, 3], [7, 6]])
imp.transform([[np.nan, np.nan]])
Pipelines: a framework for chaining feature extractors, vectorizers and estimators.
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
text_pipe = Pipeline([('vect', CountVectorizer()),
                      ('tfidf', TfidfTransformer()),
                      ('clf', MultinomialNB())])
text_pipe
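The pipeline exposes the usual estimator API; a quick usage sketch (the two example strings are made up for illustration):
text_pipe.fit(news.data, news.target)
predicted = text_pipe.predict(['God is love', 'I do not believe in any deity'])
[news.target_names[p] for p in predicted]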
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()
clf
clf = RandomForestClassifier(random_state=3, max_depth=5, n_jobs=-1)
clf
from sklearn.model_selection import train_test_split
X = iris['data']
y = iris['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=3)
print('Train size: {}'.format(len(X_train)))
print('Test size: {}'.format(len(X_test)))
%%time
clf.fit(X_train, y_train)
import pydotplus as pydot
from IPython.core.display import Image, SVG
from io import StringIO
from sklearn.tree import export_graphviz
def show_tree(tree, max_depth=3, height=600, width=600, feature_names=None):
    # render a fitted decision tree as a PNG image via graphviz
    io = StringIO()
    export_graphviz(tree, out_file=io, max_depth=max_depth, feature_names=feature_names)
    graph = pydot.graph_from_dot_data(io.getvalue())
    return Image(graph.create_png(), height=height, width=width)
show_tree(clf.estimators_[0], feature_names=iris.feature_names)
clf.predict(X_test[:5])
clf.predict_proba(X_test[:5, :])
import pickle
# saving your model
with open('myclassifier.pkl', 'wb') as f:
    pickle.dump(clf, f)
# somewhere far far away in production
with open('myclassifier.pkl', 'rb') as f:
    clf = pickle.load(f)
clf
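For scikit-learn models, joblib handles large numpy arrays more efficiently than plain pickle; an alternative sketch (the .joblib file name is illustrative):
from joblib import dump, load
dump(clf, 'myclassifier.joblib')
clf = load('myclassifier.joblib')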
from sklearn import metrics
y_pred = clf.predict(X_train)
print(metrics.classification_report(y_train, y_pred, target_names=iris.target_names))
y_pred = clf.predict(X_test)
print(metrics.classification_report(y_test, y_pred, target_names=iris.target_names))
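A confusion matrix gives a per-class breakdown of the same test-set predictions; a small sketch, not part of the original notebook:
pd.DataFrame(metrics.confusion_matrix(y_test, y_pred),
             index=iris.target_names, columns=iris.target_names)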
ax = pd.Series(clf.feature_importances_, index=iris.feature_names).plot(kind='barh', figsize=(5, 5))
ax.set(xlabel='Importance');
from sklearn.datasets import load_digits
from sklearn.model_selection import validation_curve
from sklearn.naive_bayes import GaussianNB
digits = load_digits()
# shuffle the data once (learning_curve does not shuffle by default)
from sklearn.utils import shuffle
X_digits, y_digits = shuffle(digits.data, digits.target)
plt.imshow(X_digits[0, :].reshape(8, 8), cmap=plt.cm.gray_r, interpolation='nearest')
from sklearn.model_selection import learning_curve
estimator = GaussianNB()
train_sizes, train_scores, test_scores = learning_curve(
    estimator,
    X_digits, y_digits,
    train_sizes=range(100, 1195, 100),
    n_jobs=-1
)
plt.plot(train_sizes, train_scores.mean(axis=1), 'o-', label='Training score')
plt.plot(train_sizes, test_scores.mean(axis=1), 'o-', label='Cross-validation score')
plt.legend(loc="best");
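validation_curve (imported above) plots the score as a function of a single hyperparameter instead of the training-set size; a hedged sketch for a random forest's max_depth on the same digits data:
param_range = [2, 4, 6, 8, 10]
train_scores, test_scores = validation_curve(
    RandomForestClassifier(random_state=3), X_digits, y_digits,
    param_name='max_depth', param_range=param_range, n_jobs=-1
)
plt.plot(param_range, train_scores.mean(axis=1), 'o-', label='Training score')
plt.plot(param_range, test_scores.mean(axis=1), 'o-', label='Cross-validation score')
plt.xlabel('max_depth')
plt.legend(loc="best");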
from sklearn.model_selection import cross_val_score
import numpy as np
cv = 5
scores = cross_val_score(clf, iris.data, iris.target, cv=cv, scoring='accuracy')
print('CV scores: {:.3f} +- {:.3f}'.format(scores.mean(), scores.std()))
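The cv argument also accepts an explicit splitter object; a sketch using a shuffled StratifiedKFold (the random_state is arbitrary):
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=3)
scores = cross_val_score(clf, iris.data, iris.target, cv=skf, scoring='accuracy')
print('CV scores: {:.3f} +- {:.3f}'.format(scores.mean(), scores.std()))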
clf = RandomForestClassifier()
clf
from sklearn.model_selection import GridSearchCV
parameters = {
    'max_depth': [3, 5, 10],
    'min_samples_leaf': [1, 5, 10, 30],
}
gcv = GridSearchCV(clf, parameters, scoring='accuracy', n_jobs=-1)
gcv.fit(X_train, y_train)
for params, mean_test_score, std_test_score in zip(gcv.cv_results_['params'],
                                                   gcv.cv_results_['mean_test_score'],
                                                   gcv.cv_results_['std_test_score']):
    print('{}: {:.3f} +- {:.3f}'.format(params, mean_test_score, std_test_score))
print('\nBest:')
print(gcv.best_params_)
clf.set_params(**gcv.best_params_).fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(metrics.classification_report(y_test, y_pred, target_names=iris.target_names))
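GridSearchCV with refit=True (the default) also keeps a copy of the best model refitted on the whole training set, so the manual set_params step above can be skipped:
y_pred = gcv.best_estimator_.predict(X_test)
print(metrics.classification_report(y_test, y_pred, target_names=iris.target_names))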