%pylab inline

import pylab as plt
import numpy as np

# A first look at the scikit-learn estimator API with a linear regression
from sklearn.linear_model import LinearRegression

model = LinearRegression(normalize=True)
print model.normalize
print model

x = np.array([0, 1, 2])
y = np.array([0, 1, 2])
_ = plt.plot(x, y, marker='o')

# The input data for sklearn is 2D: (samples == 3 x features == 1)
X = x[:, np.newaxis]
X

model.fit(X, y)
model.coef_

# Classification: k-nearest neighbors on the iris dataset
from sklearn import neighbors, datasets

iris = datasets.load_iris()
X, y = iris.data, iris.target
knn = neighbors.KNeighborsClassifier(n_neighbors=1)
knn.fit(X, y)

# What kind of iris has 3cm x 5cm sepal and 4cm x 2cm petal?
print iris.target_names[knn.predict([[3, 5, 4, 2]])]

# A plot of the sepal space and the prediction of the KNN
from helpers import plot_iris_knn
plot_iris_knn()

# Regression: create some simple data
import numpy as np
np.random.seed(0)
X = np.random.random(size=(20, 1))
y = 3 * X.squeeze() + 2 + np.random.normal(size=20)

# Fit a linear regression to it
from sklearn.linear_model import LinearRegression
model = LinearRegression(fit_intercept=True)
model.fit(X, y)
print "Model coefficient: %.5f, and intercept: %.5f" % (model.coef_,
                                                        model.intercept_)

# Plot the data and the model prediction
X_test = np.linspace(0, 1, 100)[:, np.newaxis]
y_test = model.predict(X_test)

import pylab as pl
print X.squeeze()
pl.plot(X.squeeze(), y, 'o')
pl.plot(X_test.squeeze(), y_test)

# Dimensionality reduction: PCA on the iris data
X, y = iris.data, iris.target

from sklearn.decomposition import PCA

pca = PCA(n_components=2)
pca.fit(X)
X_reduced = pca.transform(X)
print "Reduced dataset shape:", X_reduced.shape

import pylab as pl
pl.scatter(X_reduced[:, 0], X_reduced[:, 1], c=y)

print "Meaning of the 2 components:"
for component in pca.components_:
    print " + ".join("%.3f x %s" % (value, name)
                     for value, name in zip(component, iris.feature_names))

# Clustering: KMeans on the PCA-reduced data
from sklearn.cluster import KMeans

k_means = KMeans(n_clusters=3, random_state=0)  # Fixing the RNG in kmeans
k_means.fit(X_reduced)
y_pred = k_means.predict(X_reduced)

pl.scatter(X_reduced[:, 0], X_reduced[:, 1], c=y_pred)

# Text classification on a four-category subset of the 20 newsgroups dataset
%pylab inline
import pylab as pl
import numpy as np

# Some nice default configuration for plots
pl.rcParams['figure.figsize'] = 10, 7.5
pl.rcParams['axes.grid'] = True
pl.gray()

from sklearn.datasets import load_files
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# Load the text data
categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space',
]
twenty_train_small = load_files('../data/twenty_newsgroups/20news-bydate-train/',
                                categories=categories, charset='latin-1')
twenty_test_small = load_files('../data/twenty_newsgroups/20news-bydate-test/',
                               categories=categories, charset='latin-1')

# Turn the text documents into vectors of word frequencies
vectorizer = TfidfVectorizer(min_df=2)
X_train = vectorizer.fit_transform(twenty_train_small.data)
y_train = twenty_train_small.target

# Fit a classifier on the training set
classifier = MultinomialNB().fit(X_train, y_train)
print("Training score: {0:.1f}%".format(
    classifier.score(X_train, y_train) * 100))

# Evaluate the classifier on the testing set
X_test = vectorizer.transform(twenty_test_small.data)
y_test = twenty_test_small.target
print("Testing score: {0:.1f}%".format(
    classifier.score(X_test, y_test) * 100))

# Inspect the data layout on disk
ls -l ../data/
ls -lh ../data/twenty_newsgroups/20news-bydate-train
ls -lh ../data/twenty_newsgroups/20news-bydate-train/alt.atheism/

# Load the full (unrestricted) training and testing sets
#print(load_files.__doc__)
all_twenty_train = load_files('../data/twenty_newsgroups/20news-bydate-train/',
                              charset='latin-1', random_state=42)
all_twenty_test = load_files('../data/twenty_newsgroups/20news-bydate-test/',
                             charset='latin-1', random_state=42)
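
# Possible alternative, sketched under the assumption that network access is
# available and the local ../data/twenty_newsgroups/ folder may be missing:
# scikit-learn's fetch_20newsgroups downloads and caches an equivalent bunch
# (with .data, .target and .target_names). The *_dl names are illustrative.
from sklearn.datasets import fetch_20newsgroups

twenty_train_small_dl = fetch_20newsgroups(subset='train', categories=categories,
                                           shuffle=True, random_state=42)
twenty_test_small_dl = fetch_20newsgroups(subset='test', categories=categories,
                                          shuffle=True, random_state=42)

print twenty_train_small_dl.target_names
print len(twenty_train_small_dl.data)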
all_target_names = all_twenty_train.target_names
all_target_names

all_twenty_train.target
all_twenty_train.target.shape
all_twenty_test.target.shape

len(all_twenty_train.data)
type(all_twenty_train.data[0])

# Display a sample document and its class name
def display_sample(i, dataset):
    print("Class name: " + dataset.target_names[dataset.target[i]])
    print("Text content:\n")
    print(dataset.data[i])

display_sample(0, all_twenty_train)
display_sample(1, all_twenty_train)

# Size in megabytes of a text once encoded as bytes
def text_size(text, charset='iso-8859-1'):
    return len(text.encode(charset)) * 1e-6

train_size_mb = sum(text_size(text) for text in all_twenty_train.data)
test_size_mb = sum(text_size(text) for text in all_twenty_test.data)

print("Training set size: {0} MB".format(int(train_size_mb)))
print("Testing set size: {0} MB".format(int(test_size_mb)))

train_small_size_mb = sum(text_size(text) for text in twenty_train_small.data)
test_small_size_mb = sum(text_size(text) for text in twenty_test_small.data)

print("Training set size: {0} MB".format(int(train_small_size_mb)))
print("Testing set size: {0} MB".format(int(test_small_size_mb)))

# Vectorize the text with TF-IDF features
from sklearn.feature_extraction.text import TfidfVectorizer

TfidfVectorizer()

vectorizer = TfidfVectorizer(min_df=1)
%time X_train_small = vectorizer.fit_transform(twenty_train_small.data)

X_train_small

n_samples, n_features = X_train_small.shape
n_samples
len(twenty_train_small.data)
n_features

type(vectorizer.vocabulary_)
len(vectorizer.vocabulary_)
len(vectorizer.get_feature_names())
vectorizer.get_feature_names()[:10]
vectorizer.get_feature_names()[n_features / 2:n_features / 2 + 10]

# Project the TF-IDF vectors to 2D for visualization
from sklearn.decomposition import RandomizedPCA

%time X_train_small_pca = RandomizedPCA(n_components=2).fit_transform(X_train_small)

from itertools import cycle

colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k']
for i, c in zip(np.unique(y_train), cycle(colors)):
    pl.scatter(X_train_small_pca[y_train == i, 0],
               X_train_small_pca[y_train == i, 1],
               c=c, label=twenty_train_small.target_names[i], alpha=0.5)

_ = pl.legend(loc='best')

# Train and evaluate a Multinomial Naive Bayes classifier
y_train_small = twenty_train_small.target
y_train_small.shape
y_train_small

X_train_small.shape[0] == y_train_small.shape[0]

from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB(alpha=0.1)
clf

clf.fit(X_train_small, y_train_small)

X_test_small = vectorizer.transform(twenty_test_small.data)
y_test_small = twenty_test_small.target

X_test_small.shape
y_test_small.shape

clf.score(X_test_small, y_test_small)
clf.score(X_train_small, y_train_small)

# Look at how the vectorizer turns text into tokens
TfidfVectorizer()
print(TfidfVectorizer.__doc__)

analyzer = TfidfVectorizer().build_analyzer()
analyzer("I love scikit-learn: this is a cool Python lib!")

analyzer = TfidfVectorizer(
    preprocessor=lambda text: text,    # disable lowercasing
    token_pattern=ur'(?u)\b[\w-]+\b',  # treat hyphen as a letter
                                       # do not exclude single letter tokens
).build_analyzer()

analyzer("I love scikit-learn: this is a cool Python lib!")
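
# A possible follow-up, sketched under the assumption that chaining the steps
# is wanted: wire the vectorizer (with the hyphen-friendly token_pattern from
# above) and the Naive Bayes classifier into a single Pipeline, so raw
# documents can be fit and scored in one call. The text_clf name is
# illustrative.
from sklearn.pipeline import Pipeline

text_clf = Pipeline([
    ('vec', TfidfVectorizer(min_df=1, token_pattern=ur'(?u)\b[\w-]+\b')),
    ('clf', MultinomialNB(alpha=0.1)),
])

text_clf.fit(twenty_train_small.data, twenty_train_small.target)
print("Pipeline testing score: {0:.1f}%".format(
    text_clf.score(twenty_test_small.data, twenty_test_small.target) * 100))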