import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=UserWarning)

import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

pd.__version__

# build a simple Series
values = [5, 3, 4, 8, 2, 9]
vals = pd.Series(values)
vals
vals.index
vals.values
vals * 2.5

# a Series with string labels
vals2 = pd.Series(values, index=['tom', 'sally', 'jeff', 'george', 'pablo', 'florence'])
vals2
vals2[['florence', 'tom']]
# reindex handles labels that don't exist: they come back as NaN
vals2.reindex(['florence', 'tom', 'kate'])

vals3 = vals2.reindex(['tom', 'sally', 'pablo', 'florence', 'ricky', 'katrin'])
vals3

# several ways of dealing with missing data
vals3.dropna()
vals3.fillna(0)
vals3.fillna(vals3.mean())
vals3.ffill()  # forward-fill from the previous valid value
vals3.describe()

# give the original Series matching labels so the two align
vals.index = pd.Index(['tom', 'sally', 'pablo', 'florence', 'ricky', 'katrin'])
vals3 = vals3.reindex(['tom', 'sally', 'pablo', 'florence', 'billy', 'katrin'])

# create a dataframe
dat = pd.DataFrame({'orig': vals, 'new': vals3})
dat
dat.isnull()
dat.dropna()

# Google Trends data: hipster-ish and non-hipster-ish search terms
hipster = pd.read_csv('hipster.csv')
hipster[:10]
hipster = hipster.set_index(pd.DatetimeIndex(hipster.pop('Date')))
hipster[:10]

not_hipster = pd.read_csv('negative-hipster.csv')
not_hipster = not_hipster.set_index(pd.DatetimeIndex(not_hipster.pop('Date')))
not_hipster[:10]

hipster.hipster.head()
hipster['gumtree perth'].values[:20]
hipster.dtypes

# join the two frames on their shared dates
trend = hipster.join(not_hipster, how='inner')
trend.head()
trend.columns
trend.values

# slicing on a DatetimeIndex
trend['2012-01-01':].head()
trend['2012-01-01':'2013-01-01'].tail(3)
trend.loc['2012-01-01', ['hipster', 'modcloth']]
trend[trend.techno < 0].head()

_ = trend.plot(figsize=(10, 6))
_ = plt.legend(loc='best', ncol=2)
_ = trend.hipster.cumsum().plot()
axs = trend.plot(subplots=True, figsize=(10, 10))

# resample by month
trend.resample('M').mean().head()
# resample by year
_ = trend.resample('A').mean().plot(figsize=(10, 10))
# look at the relations
_ = pd.plotting.scatter_matrix(trend, figsize=(12, 8), diagonal='kde')

# the Kaggle Titanic training data
df = pd.read_csv('train.csv', header=0)
df.head()
df.dtypes
df.info()

df_grouped = df.groupby(['Pclass', 'Sex'])
df_grouped[['Age', 'Survived']].mean()

ax = df['Age'].dropna().hist(bins=20, range=(0, 100), alpha=.5)
ax.set_xlabel('Age')
ax.set_ylabel('Passenger Count')

bp = df.boxplot(column='Age', by='Pclass', grid=False)
for i in set(df.Pclass):
    ages = df.Age[df.Pclass == i].dropna()
    # Add some random "jitter" to the x-axis
    x = np.random.normal(i, 0.04, size=len(ages))
    plt.plot(x, ages, 'r.', alpha=0.2)

# encode sex as a numeric feature
df['isFemale'] = df['Sex'].map({'female': 1, 'male': 0}).astype(int)
df[['Sex', 'isFemale']].head()

# find the remaining string columns so we can drop them
drop_cols = df.columns[df.dtypes == 'object']
drop_cols
df.info()

X = pd.DataFrame(df[[c for c in df.columns if c != 'Survived']])
X = X.drop(drop_cols, axis=1)
X = X.drop('PassengerId', axis=1)
y = df.Survived
print(X.head())
y.groupby(y.values).count()  # class balance

# impute missing ages with the median
X['Age'] = X.Age.fillna(X.Age.median())

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score as acc

# create our classifier
clf = LogisticRegression()
# fit it to the data
clf.fit(X, y)
# and predict
preds = clf.predict(X)
res_acc = acc(y, preds)
print('Accuracy Score: {:.2f}'.format(res_acc))
# not too bad, but we trained and tested on the same rows

from sklearn.model_selection import KFold

cv = KFold(n_splits=5, shuffle=True)
preds = np.zeros_like(y)
for train, test in cv.split(X):
    clf = LogisticRegression()
    clf.fit(X.iloc[train], y.iloc[train])
    preds[test] = clf.predict(X.iloc[test])
res_acc = acc(y, preds)
print('Accuracy Score: {:.2f}'.format(res_acc))
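# Survived is an imbalanced target, so a stratified split is often safer than
# plain KFold: each fold keeps the overall class ratio. A minimal sketch on
# made-up toy data (X_demo and y_demo are illustrative, not from train.csv):
from sklearn.model_selection import StratifiedKFold

X_demo = np.arange(30).reshape(15, 2)   # 15 toy samples, 2 features
y_demo = np.array([0] * 10 + [1] * 5)   # imbalanced labels, 2:1

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
for train, test in skf.split(X_demo, y_demo):
    print(y_demo[test])                 # every test fold holds two 0s and one 1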
# scikit-learn can actually take care of this for us
from sklearn.model_selection import cross_val_score

clf = LogisticRegression()
scores = cross_val_score(clf, X, y, cv=5, scoring='accuracy')
print(scores)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

# Embarked is categorical; fill its missing values with a placeholder 'O'
df.Embarked.head()
set(df.Embarked.fillna('O'))

from sklearn import preprocessing

df.Embarked = df.Embarked.fillna('O')
le = preprocessing.LabelEncoder()
le.fit(df.Embarked.values)
le.classes_
X['Embarked'] = le.transform(df.Embarked.values)
X.Embarked.head()

# sweep the regularisation strength C
# (liblinear is one of the solvers that supports the l1 penalty)
for C in [0.001, 0.01, 0.1, 1, 10, 100]:
    clf = LogisticRegression(C=C, penalty='l1', solver='liblinear')
    scores = cross_val_score(clf, X, y, cv=5, scoring='accuracy')
    print("C: {:3.3f}\tAccuracy: {:.2f} (+/- {:.2f})"
          .format(C, scores.mean(), scores.std() * 2))

# normalise the data
from sklearn.preprocessing import StandardScaler
X = StandardScaler().fit_transform(X)

# try a zoo of classifiers
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA

names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", "Decision Tree",
         "Random Forest", "AdaBoost", "Naive Bayes", "LDA", "QDA",
         "Logistic Regression"]
classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GaussianNB(),
    LDA(),
    QDA(),
    LogisticRegression(class_weight='balanced')]

# fit each classifier and find the mean performance
res = []
for name, clf in zip(names, classifiers):
    scores = cross_val_score(clf, X, y, cv=5, scoring='accuracy')
    res.append(scores.mean())

# plot the ranked scores (prettyplotlib just gives nicer default styling)
import prettyplotlib as ppl

res = np.array(res)
names = np.array(names)
idx = np.argsort(res)[::-1]
fig, ax = plt.subplots(1, figsize=(14, 6))
ppl.bar(ax, np.arange(len(res)), res[idx], annotate=True,
        xticklabels=names[idx], grid='y')
plt.xticks(rotation=30)
_ = ax.set_ylim(res.min() * 0.95, res.max() * 1.05)

# models can be saved
import pickle
s = pickle.dumps(clf)
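# ...and restored. Note that cross_val_score fits clones internally, so the
# clf pickled above was never actually fit; a sketch, refitting on the full
# data before serialising:
clf = LogisticRegression(class_weight='balanced')
clf.fit(X, y)
s = pickle.dumps(clf)
clf2 = pickle.loads(s)
print(acc(y, clf2.predict(X)))  # the restored model predicts identically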
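# One caveat on the normalisation step earlier: StandardScaler was fit on the
# whole dataset before cross-validation, so every fold saw statistics computed
# from its own test rows. A Pipeline keeps the scaling inside each training
# fold; a minimal sketch:
from sklearn.pipeline import make_pipeline

pipe = make_pipeline(StandardScaler(), LogisticRegression(class_weight='balanced'))
scores = cross_val_score(pipe, X, y, cv=5, scoring='accuracy')
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))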