# Mean and standard deviation of the normal distribution to sample from.
mu = 0
sigma = 0.1

# Draw 10000 samples into a NumPy array.
s = np.random.normal(loc=mu, scale=sigma, size=10000)

# The same values as a plain Python list, for comparison with the array.
cpython_s = list(s)

%timeit sorted(cpython_s)

%timeit s.sort()

# Histogram of the samples in 30 bins, with `normed=True` requested.
# NOTE(review): newer matplotlib releases spell this option `density` --
# confirm against the installed version before changing it.
count, bins, _ = hist(s, normed=True, bins=30)

def probability_density(bins, mu, sigma):
    """Evaluate the normal (Gaussian) density with mean *mu* and spread *sigma*.

    *bins* may be a scalar or a NumPy array; the result has the same shape,
    because every operation is applied element-wise.
    """
    # Height of the curve at its peak (bins == mu).
    peak = 1 / (sigma * np.sqrt(2 * np.pi))
    # Exponent of the Gaussian: minus the squared offset over 2 * sigma**2.
    exponent = -(bins - mu) ** 2 / (2 * sigma ** 2)
    return peak * np.exp(exponent)

# Redraw the histogram, then overlay the theoretical density evaluated at
# the bin edges as a red line.
count, bins, _ = hist(s, normed=True, bins=30)
density_curve = probability_density(bins, mu, sigma)
plot(bins, density_curve, "r")

from scipy.misc import lena
from scipy.ndimage.filters import sobel
# Show the image returned by `lena()` with the grayscale colormap.
# NOTE(review): `scipy.misc.lena` is absent from newer SciPy releases --
# confirm the installed version still provides it.
imshow(lena(), cmap=cm.gray)

# Apply the Sobel filter to the same image and show the result.
lena_edges = sobel(lena())
imshow(lena_edges, cmap=cm.gray)

from sklearn import linear_model, datasets 
# Load the iris dataset through scikit-learn's `datasets` module.
iris = datasets.load_iris()

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn import datasets
from sklearn.decomposition import PCA

# Load the iris data and keep only its first two feature columns.
iris = datasets.load_iris()
X = iris.data[:, :2]
Y = iris.target

# Axis limits: the range of each kept feature, padded by 0.5 on both sides.
x_min = X[:, 0].min() - 0.5
x_max = X[:, 0].max() + 0.5
y_min = X[:, 1].min() - 0.5
y_max = X[:, 1].max() + 0.5

# Work on figure number 2, cleared of any earlier content.
plt.figure(2)
plt.clf()

# Scatter the samples, coloured by their target label.
plt.scatter(X[:, 0], X[:, 1], c=Y, cmap=plt.cm.Paired)
plt.xlabel('Sepal length')
plt.ylabel('Sepal width')

# Apply the padded limits and remove the ticks from both axes.
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.xticks(())
plt.yticks(())

# To get a better understanding of how the dimensions interact,
# plot the data projected onto its first three PCA components.
fig = plt.figure(1)
ax = Axes3D(fig, elev=-150, azim=110)
X_reduced = PCA(n_components=3).fit_transform(iris.data)
ax.scatter(
    X_reduced[:, 0], X_reduced[:, 1], X_reduced[:, 2],
    c=Y, cmap=plt.cm.Paired,
)
ax.set_title("First three PCA directions")

# Name each axis after the component it shows ...
ax.set_xlabel("1st eigenvector")
ax.set_ylabel("2nd eigenvector")
ax.set_zlabel("3rd eigenvector")

# ... and blank out the tick labels on all three axes.
ax.w_xaxis.set_ticklabels([])
ax.w_yaxis.set_ticklabels([])
ax.w_zaxis.set_ticklabels([])

plt.show()

# Bare expression: in an interactive session this displays the loaded dataset object.
iris

from sklearn.cross_validation import train_test_split

# Append the target labels as one extra, final column next to the features,
# so that features and labels are shuffled and split together.
iris_full = np.column_stack((iris.data, iris.target))

# Hold out 10 rows for testing; the remaining rows are used for training.
iris_train, iris_test = train_test_split(iris_full, test_size=10)

# Sizes of the two partitions.
(len(iris_train), len(iris_test))

# Fit a logistic regression (C=1e5) on the training rows.  The last column
# holds the label; every column before it is a feature.
logreg = linear_model.LogisticRegression(C=1e5)
train_features = iris_train[:, :-1]
train_labels = iris_train[:, -1]
logreg.fit(train_features, train_labels)

# Labels predicted for the held-out rows ...
logreg.predict(iris_test[:, :-1])

# ... and the true labels of those rows, for comparison.
iris_test[:, -1]

# Use only the first two features so the decision regions can be drawn in 2-D.
X = iris.data[:, :2]
Y = iris.target

h = .02  # spacing between neighbouring mesh points

# Create a logistic regression classifier and fit it to the data.
logreg = linear_model.LogisticRegression(C=1e5)
logreg.fit(X, Y)

# Plot the decision boundary: predict a class for every point of a mesh
# covering [x_min, x_max] x [y_min, y_max] and colour the point accordingly.
x_min = X[:, 0].min() - .5
x_max = X[:, 0].max() + .5
y_min = X[:, 1].min() - .5
y_max = X[:, 1].max() + .5
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
mesh_points = np.column_stack((xx.ravel(), yy.ravel()))

# Put the flat predictions back onto the mesh shape and draw a colour plot.
Z = logreg.predict(mesh_points).reshape(xx.shape)
plt.figure(1)
plt.pcolormesh(xx, yy, Z, cmap=plt.cm.Paired)

# Overlay the training points, outlined in black.
plt.scatter(X[:, 0], X[:, 1], c=Y, edgecolors='k', cmap=plt.cm.Paired)
plt.xlabel('Sepal length')
plt.ylabel('Sepal width')

# Clip the view to the mesh extent and hide the ticks.
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.xticks(())
plt.yticks(())

plt.show()

import pandas as pd
# Column names to assign to the pipe-separated user file.
u_cols = [
    'user_id',
    'age',
    'sex',
    'occupation',
    'zip_code',
]
users = pd.read_csv('ml-100k/u.user', names=u_cols, sep='|')

# Display the loaded table.
users

# Ratings: tab-separated, one row per (user, movie) rating.
r_cols = [
    'user_id',
    'movie_id',
    'rating',
    'unix_timestamp',
]
ratings = pd.read_csv('ml-100k/u.data', names=r_cols, sep='\t')

# Movies: pipe-separated; only the first five columns of the file are read.
m_cols = [
    'movie_id',
    'title',
    'release_date',
    'video_release_date',
    'imdb_url',
]
movies = pd.read_csv('ml-100k/u.item', names=m_cols, sep='|', usecols=range(5))

# Display both tables.
movies

ratings

# Attach movie details to every rating.  The join key is named explicitly
# here; when `on` is left as None pandas can infer it.
movie_ratings = movies.merge(ratings, on="movie_id")

# Add the user details as well, letting pandas infer the join key.
lens = movie_ratings.merge(users)

# Number of ratings per title, largest first.
# NOTE(review): `Series.order` is absent from newer pandas releases --
# confirm the installed version still provides it.
ratings_per_title = lens.groupby('title').size()
ratings_per_title.order(ascending=False)

# Per-title rating count and mean rating, sorted by mean rating (highest first).
# NOTE(review): `DataFrame.sort` is absent from newer pandas releases --
# confirm the installed version still provides it.
movies_stats = (
    lens.groupby('title')
    .agg({'rating': [np.size, np.mean]})
    .sort([('rating', 'mean')], ascending=False)
)
movies_stats.head()

# Boolean mask of titles with at least 100 ratings.  The 'size' column is
# selected with brackets because attribute access (`.size`) can resolve to the
# DataFrame's own `size` property (its element count) instead of the column,
# which would collapse the mask into a single scalar.
atleast_100 = movies_stats['rating']['size'] >= 100

# Best-rated titles among those with at least 100 ratings.
movies_stats[atleast_100].head()

# Histogram of user ages in 30 bins.
users['age'].hist(bins=30)

# The 50 movie ids with the most ratings.
ratings_per_movie = lens.groupby('movie_id').size()
most_50 = ratings_per_movie.order(ascending=False).head(50)

# One row per (movie_id, title), one column per value of 'sex', holding the
# aggregated rating (pivot_table's default aggregation); gaps are filled with 0.
# NOTE(review): newer pandas releases name these keywords `index` and
# `columns` -- confirm against the installed version.
pivoted = lens.pivot_table(
    values='rating',
    rows=['movie_id', 'title'],
    cols=['sex'],
    fill_value=0,
)
pivoted.head()

# Difference between the M and F columns for every movie.
pivoted['diff'] = pivoted['M'] - pivoted['F']
pivoted.head()

# Move `movie_id` out of the index into an ordinary column so it can be
# matched against the ids of the 50 most-rated movies.
pivoted.reset_index('movie_id', inplace=True)
in_most_50 = pivoted['movie_id'].isin(most_50.index)
disagreements = pivoted[in_most_50]['diff']

# Horizontal bar chart of the M - F differences, sorted in ascending order.
sorted_disagreements = disagreements.order()
sorted_disagreements.plot(kind='barh', figsize=[9, 15])