import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=UserWarning)

import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

pd.__version__

# build a simple Series
values = [5, 3, 4, 8, 2, 9]
vals = pd.Series(values)
vals
vals.index
vals.values
vals * 2.5

# a Series with string labels
vals2 = pd.Series(values, index=['tom', 'sally', 'jeff', 'george', 'pablo', 'florence'])
vals2
vals2[['florence', 'tom']]
# reindex handles labels that don't exist: they come back as NaN
vals2.reindex(['florence', 'tom', 'kate'])

vals3 = vals2.reindex(['tom', 'sally', 'pablo', 'florence', 'ricky', 'katrin'])
vals3

# several ways of dealing with missing data
vals3.dropna()
vals3.fillna(0)
vals3.fillna(vals3.mean())
vals3.ffill()  # forward-fill from the previous valid value
vals3.describe()

# give the original Series matching labels so the two align
vals.index = pd.Index(['tom', 'sally', 'pablo', 'florence', 'ricky', 'katrin'])
vals3 = vals3.reindex(['tom', 'sally', 'pablo', 'florence', 'billy', 'katrin'])

# create a dataframe
dat = pd.DataFrame({'orig': vals, 'new': vals3})
dat
dat.isnull()
dat.dropna()

# Google Trends data: hipster-ish and non-hipster-ish search terms
hipster = pd.read_csv('hipster.csv')
hipster[:10]
hipster = hipster.set_index(pd.DatetimeIndex(hipster.pop('Date')))
hipster[:10]

not_hipster = pd.read_csv('negative-hipster.csv')
not_hipster = not_hipster.set_index(pd.DatetimeIndex(not_hipster.pop('Date')))
not_hipster[:10]

hipster.hipster.head()
hipster['gumtree perth'].values[:20]
hipster.dtypes

# join the two frames on their shared dates
trend = hipster.join(not_hipster, how='inner')
trend.head()
trend.columns
trend.values

# slicing on a DatetimeIndex
trend['2012-01-01':].head()
trend['2012-01-01':'2013-01-01'].tail(3)
trend.loc['2012-01-01', ['hipster', 'modcloth']]
trend[trend.techno < 0].head()

_ = trend.plot(figsize=(10, 6))
_ = plt.legend(loc='best', ncol=2)
_ = trend.hipster.cumsum().plot()
axs = trend.plot(subplots=True, figsize=(10, 10))

# resample by month
trend.resample('M').mean().head()
# resample by year
_ = trend.resample('A').mean().plot(figsize=(10, 10))
# look at the relations
_ = pd.plotting.scatter_matrix(trend, figsize=(12, 8), diagonal='kde')

# the Kaggle Titanic training data
df = pd.read_csv('train.csv', header=0)
df.head()
df.dtypes
df.info()

df_grouped = df.groupby(['Pclass', 'Sex'])
df_grouped[['Age', 'Survived']].mean()

ax = df['Age'].dropna().hist(bins=20, range=(0, 100), alpha=.5)
ax.set_xlabel('Age')
ax.set_ylabel('Passenger Count')

bp = df.boxplot(column='Age', by='Pclass', grid=False)
for i in set(df.Pclass):
    ages = df.Age[df.Pclass == i].dropna()
    # Add some random "jitter" to the x-axis
    x = np.random.normal(i, 0.04, size=len(ages))
    plt.plot(x, ages, 'r.', alpha=0.2)

# encode sex as a numeric feature
df['isFemale'] = df['Sex'].map({'female': 1, 'male': 0}).astype(int)
df[['Sex', 'isFemale']].head()

# find the remaining string columns so we can drop them
drop_cols = df.columns[df.dtypes == 'object']
drop_cols
df.info()

X = pd.DataFrame(df[[c for c in df.columns if c != 'Survived']])
X = X.drop(drop_cols, axis=1)
X = X.drop('PassengerId', axis=1)
y = df.Survived
print(X.head())
y.groupby(y.values).count()  # class balance

# impute missing ages with the median
X['Age'] = X.Age.fillna(X.Age.median())

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score as acc

# create our classifier
clf = LogisticRegression()
# fit it to the data
clf.fit(X, y)
# and predict
preds = clf.predict(X)
res_acc = acc(y, preds)
print('Accuracy Score: {:.2f}'.format(res_acc))
# not too bad, but we trained and tested on the same rows

from sklearn.model_selection import KFold

cv = KFold(n_splits=5, shuffle=True)
preds = np.zeros_like(y)
for train, test in cv.split(X):
    clf = LogisticRegression()
    clf.fit(X.iloc[train], y.iloc[train])
    preds[test] = clf.predict(X.iloc[test])
res_acc = acc(y, preds)
print('Accuracy Score: {:.2f}'.format(res_acc))
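# Survived is an imbalanced target, so a stratified split is often safer than
# plain KFold: each fold keeps the overall class ratio. A minimal sketch on
# made-up toy data (X_demo and y_demo are illustrative, not from train.csv):
from sklearn.model_selection import StratifiedKFold

X_demo = np.arange(30).reshape(15, 2)   # 15 toy samples, 2 features
y_demo = np.array([0] * 10 + [1] * 5)   # imbalanced labels, 2:1

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
for train, test in skf.split(X_demo, y_demo):
    print(y_demo[test])                 # every test fold holds two 0s and one 1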
# scikit-learn can actually take care of this for us
from sklearn.model_selection import cross_val_score

clf = LogisticRegression()
scores = cross_val_score(clf, X, y, cv=5, scoring='accuracy')
print(scores)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

# Embarked is categorical; fill its missing values with a placeholder 'O'
df.Embarked.head()
set(df.Embarked.fillna('O'))

from sklearn import preprocessing

df.Embarked = df.Embarked.fillna('O')
le = preprocessing.LabelEncoder()
le.fit(df.Embarked.values)
le.classes_
X['Embarked'] = le.transform(df.Embarked.values)
X.Embarked.head()

# sweep the regularisation strength C
# (liblinear is one of the solvers that supports the l1 penalty)
for C in [0.001, 0.01, 0.1, 1, 10, 100]:
    clf = LogisticRegression(C=C, penalty='l1', solver='liblinear')
    scores = cross_val_score(clf, X, y, cv=5, scoring='accuracy')
    print("C: {:3.3f}\tAccuracy: {:.2f} (+/- {:.2f})"
          .format(C, scores.mean(), scores.std() * 2))

# normalise the data
from sklearn.preprocessing import StandardScaler
X = StandardScaler().fit_transform(X)

# try a zoo of classifiers
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA

names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", "Decision Tree",
         "Random Forest", "AdaBoost", "Naive Bayes", "LDA", "QDA",
         "Logistic Regression"]
classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GaussianNB(),
    LDA(),
    QDA(),
    LogisticRegression(class_weight='balanced')]

# fit each classifier and find the mean performance
res = []
for name, clf in zip(names, classifiers):
    scores = cross_val_score(clf, X, y, cv=5, scoring='accuracy')
    res.append(scores.mean())

# plot the ranked scores (prettyplotlib just gives nicer default styling)
import prettyplotlib as ppl

res = np.array(res)
names = np.array(names)
idx = np.argsort(res)[::-1]
fig, ax = plt.subplots(1, figsize=(14, 6))
ppl.bar(ax, np.arange(len(res)), res[idx], annotate=True,
        xticklabels=names[idx], grid='y')
plt.xticks(rotation=30)
_ = ax.set_ylim(res.min() * 0.95, res.max() * 1.05)

# models can be saved
import pickle
s = pickle.dumps(clf)
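# ...and restored. Note that cross_val_score fits clones internally, so the
# clf pickled above was never actually fit; a sketch, refitting on the full
# data before serialising:
clf = LogisticRegression(class_weight='balanced')
clf.fit(X, y)
s = pickle.dumps(clf)
clf2 = pickle.loads(s)
print(acc(y, clf2.predict(X)))  # the restored model predicts identically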
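# One caveat on the normalisation step earlier: StandardScaler was fit on the
# whole dataset before cross-validation, so every fold saw statistics computed
# from its own test rows. A Pipeline keeps the scaling inside each training
# fold; a minimal sketch:
from sklearn.pipeline import make_pipeline

pipe = make_pipeline(StandardScaler(), LogisticRegression(class_weight='balanced'))
scores = cross_val_score(pipe, X, y, cv=5, scoring='accuracy')
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))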