import numpy as np import pandas as pd %matplotlib inline from sklearn import cross_validation from sklearn.metrics import mean_squared_error from sklearn.ensemble import RandomForestRegressor df_origin = pd.read_csv("data/train.csv") df_origin.head() df_origin.tail(24) df_origin.describe() df_test = pd.read_csv('data/test.csv') df_test.head() df_test.shape df_origin.columns.values def findNan(df): cols = df.columns.values print cols for c in cols: print c, df[c].unique() findNan(df_origin) df_origin['hour'] = df_origin['datetime'].str[11:13] df_origin.hour = df_origin.hour.astype(int) df_origin.head() from datetime import datetime def func(df): i = 0 for timestamp in df['datetime']: i += 1 date_object = datetime.strptime(timestamp[:10], '%Y-%m-%d') date = datetime.date(date_object).weekday() df.loc[i-1, 'week'] = date return df df_origin = func(df_origin) df_origin.head() df_origin['year'] = df_origin['datetime'].str[:4] df_origin['month'] = df_origin['datetime'].str[5:7] df_origin['year'] = df_origin.year.astype(int) df_origin['month'] = df_origin.month.astype(int) df_origin.head() df_origin.columns.values df_clean = df_origin[['season', 'holiday', 'workingday', 'weather', 'temp', 'atemp', 'humidity', 'windspeed', 'casual', 'registered', 'count', 'hour', 'week', 'year', 'month']] df_clean.head() df_test['hour'] = df_test['datetime'].str[11:13] df_test.hour = df_test.hour.astype(int) df_test = func(df_test) df_test.shape df_test['year'] = df_test['datetime'].str[:4] df_test['month'] = df_test['datetime'].str[5:7] df_test['year'] = df_test.year.astype(int) df_test['month'] = df_test.month.astype(int) df_test.head() df_clean_test = df_test.drop('datetime', axis=1) df_clean_test.head() df_origin['casual'].hist() df_origin['registered'].hist() df_clean['log_cas'] = log(df_origin['casual'] + 1) df_clean['log_reg'] = log(df_origin['registered'] + 1) df_clean.head() df_clean['log_cas'].hist() df_clean['log_reg'].hist() df_clean['temp'].hist() df_clean.head() 
# Fit one random forest per log target (``log_cas`` and ``log_reg``), combine
# their predictions on the test set and write the result to disk.
#
# NOTE: this is a notebook export. Bare expressions such as
# ``clf_cal.oob_score_`` are cell displays and do nothing in a plain script.

# Candidate features for the first fit of each model.
fea_cols = ['season', 'holiday', 'workingday', 'weather', 'temp', 'atemp',
            'humidity', 'windspeed', 'hour', 'week', 'year']
df_clean[fea_cols].corr()

# ---------------------------------------------------------------------------
# Model for the ``log_cas`` target.
# ---------------------------------------------------------------------------
clf_cal = RandomForestRegressor(n_estimators=1000, min_samples_split=11,
                                oob_score=True)
clf_cal.fit(df_clean[fea_cols].values, df_clean['log_cas'].values)
pd.DataFrame(clf_cal.feature_importances_).plot(kind='bar')
clf_cal.oob_score_
print(clf_cal.feature_importances_)

# Refit without 'holiday' and 'atemp' (presumably dropped after inspecting
# the importances above -- confirm).
fea_cas = ['season', 'workingday', 'weather', 'temp', 'humidity',
           'windspeed', 'hour', 'week', 'year']
clf_cal.fit(df_clean[fea_cas].values, df_clean['log_cas'].values)
pd.DataFrame(clf_cal.feature_importances_).plot(kind='bar')
clf_cal.oob_score_

# ---------------------------------------------------------------------------
# Model for the ``log_reg`` target.
# ---------------------------------------------------------------------------
clf_reg = RandomForestRegressor(n_estimators=1000, min_samples_split=11,
                                oob_score=True)
clf_reg.fit(df_clean[fea_cols].values, df_clean['log_reg'].values)
pd.DataFrame(clf_reg.feature_importances_).plot(kind='bar')
clf_reg.oob_score_
print(clf_reg.feature_importances_)

# Refit without 'holiday', 'atemp' and 'windspeed'.
fea_regs = ['season', 'workingday', 'weather', 'temp', 'humidity', 'hour',
            'week', 'year']
clf_reg.fit(df_clean[fea_regs].values, df_clean['log_reg'].values)
pd.DataFrame(clf_reg.feature_importances_).plot(kind='bar')
clf_reg.oob_score_

# ---------------------------------------------------------------------------
# Predict, undo the log(x + 1) transform and write the result file.
# ---------------------------------------------------------------------------
# Each model predicts log(x + 1), so each exp() term carries a +1 offset;
# subtracting 2 removes both. ``np.exp`` is spelled out because a bare
# ``exp`` only exists in a pylab session. ``.values`` matches how the models
# were fitted (plain arrays, no column names).
y_pred7 = (np.exp(clf_cal.predict(df_clean_test[fea_cas].values))
           + np.exp(clf_reg.predict(df_clean_test[fea_regs].values))
           - 2)
y_pred7[:40]

y_pred7 = [round(x) for x in y_pred7]
df_test['count'] = y_pred7
df_test['count'] = df_test['count'].astype(int)
df_test.head()
df_test.shape

# The keyword is ``sep``; the original ``seq`` is not a ``to_csv`` parameter
# and raised a TypeError before anything was written.
df_test.to_csv('result15', sep=',', columns=['datetime', 'count'],
               header=['datetime', 'count'], index=False)