%matplotlib inline
%config InlineBackend.figure_format='retina'

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import cm as cmap
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import LabelEncoder

sns.set(font='sans')

# String identifier columns that must be label-encoded before training.
labelize_columns = ['medallion', 'hack_license', 'vendor_id']
# Columns that must be cast to int.
interize_columns = ['pickup_month', 'pickup_weekday', 'pickup_non_working_today',
                    'pickup_non_working_tomorrow']
feature_columns = ['medallion', 'hack_license', 'vendor_id', 'pickup_month',
                   'pickup_weekday', 'pickup_day', 'pickup_time_in_mins',
                   'pickup_non_working_today', 'pickup_non_working_tomorrow',
                   'fare_amount', 'surcharge', 'tolls_amount', 'passenger_count',
                   'trip_time_in_secs', 'trip_distance', 'pickup_longitude',
                   'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude']
class_column = 'tip_label'

data = pd.read_csv('../data/dataset/dataset.csv')

# Encode the string identifier columns as integers.
for column in labelize_columns:
    le = LabelEncoder()
    data[column] = le.fit_transform(data[column].values)
    le = None

for column in interize_columns:
    data[column] = data[column].astype(int)

data_features = data[feature_columns].values
data_classes = data[class_column].values

# 10 stratified shuffle splits, each holding out 10% of the trips for testing.
cross_validation = StratifiedShuffleSplit(n_splits=10, test_size=0.1, random_state=0)

scores = []
confusion_matrices = []
for train_index, test_index in cross_validation.split(data_features, data_classes):
    data_features_train, data_classes_train = data_features[train_index], data_classes[train_index]
    data_features_test, data_classes_test = data_features[test_index], data_classes[test_index]

    # You need at least 16 GB of RAM to predict 6 classes with 256 trees.
    # You can use fewer trees, but accuracy gradually degrades.
    clf = RandomForestClassifier(n_estimators=256, n_jobs=-1)
    clf.fit(data_features_train, data_classes_train)

    # Save the score for this split.
    scores.append(clf.score(data_features_test, data_classes_test))

    # Save the confusion matrix for this split.
    data_classes_pred = clf.predict(data_features_test)
    confusion_matrices.append(confusion_matrix(data_classes_test, data_classes_pred))

    # Free the (large) fitted model before the next iteration.
    clf = None
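# (Added sketch, not part of the original analysis.) If you want a rough idea of
# which features the forest relies on, you can refit a single, smaller model on
# the last train split and inspect feature_importances_. The n_estimators=32
# value is an arbitrary choice to keep memory usage low, so treat the numbers
# as indicative only.
clf_check = RandomForestClassifier(n_estimators=32, n_jobs=-1, random_state=0)
clf_check.fit(data_features_train, data_classes_train)
for name, importance in sorted(zip(feature_columns, clf_check.feature_importances_),
                               key=lambda pair: pair[1], reverse=True):
    print('{}: {:.4f}'.format(name, importance))
clf_check = None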
print('Accuracy mean: ' + str(np.mean(scores)))
print('Accuracy std: ' + str(np.std(scores)))

# Sum the per-split confusion matrices into one aggregate matrix.
cm = np.sum(confusion_matrices, axis=0)

# The leading blank entry offsets the labels so they line up with matshow's ticks.
classes = [' ', '[0-10)', '[10-15)', '[15-20)', '[20-25)', '[25-30)', '[30-inf)']

fig, axes = plt.subplots()
colorbar = axes.matshow(cm, cmap=cmap.Blues)
fig.colorbar(colorbar, ticks=[0, 25000, 50000, 75000, 100000, 125000,
                              150000, 175000, 200000, 225000, 250000])
axes.set_xlabel('Predicted class', fontsize=15)
axes.set_ylabel('True class', fontsize=15)
axes.set_xticklabels(classes)
axes.set_yticklabels(classes)
axes.tick_params(labelsize=12)

# Number of trips per integer tip percentage.
tip = data.groupby('tip_perc').size()
tip.index = np.floor(tip.index)
ax = tip.groupby(tip.index).sum().plot(kind='bar', figsize=(15, 5))
ax.set_xlabel('floor(tip_perc)', fontsize=18)
ax.set_ylabel('number of trips', fontsize=18)
ax.tick_params(labelsize=12)
tip = None

# Relabel the trips into two classes split at a 20% tip.
tip_labels = ['< 20', '>= 20']
tip_ranges_by_label = [[0.0, 20.0], [20.0, 51.0]]
for i, tip_label in enumerate(tip_labels):
    tip_mask = ((data.tip_perc >= tip_ranges_by_label[i][0]) &
                (data.tip_perc < tip_ranges_by_label[i][1]))
    data.loc[tip_mask, 'tip_label'] = tip_label
    tip_mask = None

data.to_csv('../data/dataset/dataset.csv', index=False)
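# (Added sketch, not what the notebook originally used.) The two-class labeling
# above can also be expressed with pd.cut. right=False makes the bins
# left-closed, i.e. [0, 20) and [20, 51), matching the masks; this check assumes
# every tip_perc falls inside [0, 51).
alt_labels = pd.cut(data.tip_perc, bins=[0.0, 20.0, 51.0],
                    right=False, labels=tip_labels).astype(str)
print('labels agree: ' + str((alt_labels == data.tip_label).all()))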