# Tour of the scientific Python stack: NumPy sampling and sorting, SciPy image
# filtering, scikit-learn on the iris data set, and loading MovieLens 100k
# with pandas.
#
# Every name is imported explicitly so the script does not depend on the
# implicit ``%pylab`` namespace.  Bare expressions (``iris``, ``users``, ...)
# are kept from the notebook: they display a value when run as a notebook cell
# and are no-ops when run as a plain script.
import timeit

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# Imported for its side effect: older matplotlib only registers the "3d"
# projection once this module has been imported.
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401
from scipy.ndimage import sobel
from sklearn import datasets, linear_model
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

# ``scipy.misc.lena`` was removed from SciPy; ``ascent`` is the replacement
# sample image.  Its home moved between SciPy releases, so try both.
try:
    from scipy.datasets import ascent as sample_image
except ImportError:
    from scipy.misc import ascent as sample_image


# ---------------------------------------------------------------------------
# NumPy: draw normal samples and compare list sorting with array sorting
# ---------------------------------------------------------------------------
mu, sigma = 0, 0.1
s = np.random.normal(mu, sigma, 10000)
cpython_s = list(s)

# ``s.sort()`` sorts in place, so every repetition after the first would time
# an already-sorted array.  ``np.sort`` returns a sorted copy and leaves ``s``
# untouched, which makes it comparable to ``sorted`` on the list.
timing_runs = 100
list_seconds = timeit.timeit(lambda: sorted(cpython_s), number=timing_runs)
array_seconds = timeit.timeit(lambda: np.sort(s), number=timing_runs)
print("sorted(list):   {:.1f} us per call".format(1e6 * list_seconds / timing_runs))
print("np.sort(array): {:.1f} us per call".format(1e6 * array_seconds / timing_runs))

# ``density=True`` replaces the removed ``normed=True``: bar areas sum to 1.
count, bins, _ = plt.hist(s, bins=30, density=True)
plt.show()


def probability_density(bins, mu, sigma):
    """Evaluate the normal (Gaussian) probability density function.

    Args:
        bins: Point or array of points at which to evaluate the density.
        mu: Mean of the distribution.
        sigma: Standard deviation of the distribution.

    Returns:
        The density at ``bins``; same shape as ``bins`` for array input.
    """
    normalisation = 1 / (sigma * np.sqrt(2 * np.pi))
    return normalisation * np.exp(-(bins - mu) ** 2 / (2 * sigma ** 2))


# Same histogram again, this time with the theoretical density drawn on top.
count, bins, _ = plt.hist(s, bins=30, density=True)
plt.plot(bins, probability_density(bins, mu, sigma), "r")
plt.show()


# ---------------------------------------------------------------------------
# SciPy: Sobel edge filter on a sample image
# ---------------------------------------------------------------------------
plt.imshow(sample_image(), cmap=plt.cm.gray)
plt.show()

# Filter a float copy: the sample image may be an unsigned 8-bit array, in
# which case the signed derivative values would wrap around.
plt.imshow(sobel(sample_image().astype(float)), cmap=plt.cm.gray)
plt.show()


# ---------------------------------------------------------------------------
# scikit-learn: explore the iris data set
# ---------------------------------------------------------------------------
# import some data to play with
iris = datasets.load_iris()
X = iris.data[:, :2]  # we only take the first two features.
Y = iris.target

x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5

plt.figure(2)
plt.clf()

# Plot the training points
plt.scatter(X[:, 0], X[:, 1], c=Y, cmap=plt.cm.Paired)
plt.xlabel('Sepal length')
plt.ylabel('Sepal width')

plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.xticks(())
plt.yticks(())

# To get a better understanding of the interaction of the dimensions,
# plot the first three PCA dimensions.
fig = plt.figure(1)
# Create the 3D axes through the figure: a directly constructed ``Axes3D`` is
# no longer attached to the figure automatically by recent matplotlib.
ax = fig.add_subplot(111, projection="3d", elev=-150, azim=110)
X_reduced = PCA(n_components=3).fit_transform(iris.data)
ax.scatter(X_reduced[:, 0], X_reduced[:, 1], X_reduced[:, 2], c=Y,
           cmap=plt.cm.Paired)
ax.set_title("First three PCA directions")
ax.set_xlabel("1st eigenvector")
ax.xaxis.set_ticklabels([])
ax.set_ylabel("2nd eigenvector")
ax.yaxis.set_ticklabels([])
ax.set_zlabel("3rd eigenvector")
ax.zaxis.set_ticklabels([])

plt.show()

iris

# ---------------------------------------------------------------------------
# scikit-learn: logistic regression with a held-out test set
# ---------------------------------------------------------------------------
# Glue features and target together so one split keeps rows and labels aligned;
# the last column is the target.
iris_full = np.c_[iris.data, iris.target]
iris_train, iris_test = train_test_split(iris_full, test_size=10)
len(iris_train), len(iris_test)

logreg = linear_model.LogisticRegression(C=1e5)
logreg.fit(iris_train[:, :-1], iris_train[:, -1])

# Predicted labels, followed by the true labels for comparison.
logreg.predict(iris_test[:, :-1])
iris_test[:, -1]

# ---------------------------------------------------------------------------
# scikit-learn: decision boundary on the first two features
# ---------------------------------------------------------------------------
X = iris.data[:, :2]  # we only take the first two features.
Y = iris.target

h = .02  # step size in the mesh

logreg = linear_model.LogisticRegression(C=1e5)

# Fit the logistic regression classifier created above.
logreg.fit(X, Y)

# Plot the decision boundary. For that, we will assign a color to each
# point in the mesh [x_min, x_max]x[y_min, y_max].
x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
Z = logreg.predict(np.c_[xx.ravel(), yy.ravel()])

# Put the result into a color plot
Z = Z.reshape(xx.shape)
plt.figure(1)
plt.pcolormesh(xx, yy, Z, cmap=plt.cm.Paired)

# Plot also the training points
plt.scatter(X[:, 0], X[:, 1], c=Y, edgecolors='k', cmap=plt.cm.Paired)
plt.xlabel('Sepal length')
plt.ylabel('Sepal width')

plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.xticks(())
plt.yticks(())

plt.show()

# ---------------------------------------------------------------------------
# pandas: load the MovieLens 100k tables
# ---------------------------------------------------------------------------
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('ml-100k/u.user', sep='|', names=u_cols)
users

r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv('ml-100k/u.data', sep='\t', names=r_cols)

# Only the first five columns of u.item are read; the rest are skipped.
# The file is not UTF-8 (titles contain Latin-1 accented characters), so the
# encoding must be given explicitly or Python 3 fails to decode it.
m_cols = ['movie_id', 'title', 'release_date', 'video_release_date', 'imdb_url']
movies = pd.read_csv('ml-100k/u.item', sep='|', names=m_cols, usecols=range(5),
                     encoding='latin-1')
movies

ratings

movie_ratings = pd.merge(movies, ratings, on="movie_id")  # if `on` is None, it can be inferred!
# ---------------------------------------------------------------------------
# pandas: analyse the merged MovieLens ratings
# ---------------------------------------------------------------------------
# No ``on`` is given, so merge joins on the columns the two frames share.
lens = pd.merge(movie_ratings, users)

# Number of ratings per title, most-rated first.
# (``Series.order`` was removed from pandas; ``sort_values`` replaces it.)
lens.groupby('title').size().sort_values(ascending=False)

# Rating count and mean rating per title, best-rated first.  The string
# aggregator names yield the same ('rating', 'size') / ('rating', 'mean')
# column labels as passing the NumPy functions did.
movies_stats = (
    lens.groupby('title')
    .agg({'rating': ['size', 'mean']})
    .sort_values([('rating', 'mean')], ascending=False)
)
movies_stats.head()

# Select the 'size' column by key: attribute access (``.size``) returns the
# DataFrame's total element count, a scalar, instead of a per-title mask.
atleast_100 = movies_stats['rating']['size'] >= 100
movies_stats[atleast_100].head()

users.age.hist(bins=30)

# The 50 movies with the most ratings.
most_50 = lens.groupby('movie_id').size().sort_values(ascending=False)[:50]

# Mean rating per movie, one column per value of `sex`.
# (``rows=`` / ``cols=`` were renamed to ``index=`` / ``columns=``.)
pivoted = lens.pivot_table(index=['movie_id', 'title'],
                           columns=['sex'],
                           values='rating',
                           fill_value=0)
pivoted.head()

# Positive values: rated higher by men; negative: rated higher by women.
pivoted['diff'] = pivoted.M - pivoted.F
pivoted.head()

# Turn movie_id back into a regular column so it can be matched against the
# most-rated movie ids; `title` stays as the index and labels the bars.
pivoted.reset_index('movie_id', inplace=True)
disagreements = pivoted[pivoted.movie_id.isin(most_50.index)]['diff']
disagreements.sort_values().plot(kind='barh', figsize=[9, 15])