print 'Welcome to a tutorial on Data Analysis with Pandas' print 'You can run any cell by pressing shift + enter command from your keyboard' import numpy as np import pandas import matplotlib.pyplot as plt def entries_histogram(turnstile_weather): ''' Before we perform any analysis, it might be useful to take a look at the data we're hoping to analyze. More specifically, let's examine the hourly entries in our NYC subway data and determine what distribution the data follows. This data is stored in a dataframe called turnstile_weather under the ['ENTRIESn_hourly'] column. Let's plot two histograms on the same axes to show hourly entries when raining vs. when not raining. Here's an example on how to plot histograms with pandas and matplotlib: turnstile_weather['column_to_graph'].hist() Your histograph may look similar to bar graph in the instructor notes below. You can read a bit about using matplotlib and pandas to plot histograms here: http://pandas.pydata.org/pandas-docs/stable/visualization.html#histograms You can see the information contained within the turnstile weather data here: https://www.dropbox.com/s/meyki2wl9xfa7yk/turnstile_data_master_with_weather.csv ''' turnstile_weather = pandas.read_csv(turnstile_weather) plt.figure() turnstile_weather['ENTRIESn_hourly'][turnstile_weather['rain']==1].hist(bins=20, alpha = 0.8) # code here to plot a historgram for hourly entries when it is raining turnstile_weather['ENTRIESn_hourly'][turnstile_weather['rain']==0].hist(bins=20, alpha = 0.3) # code here to plot a historgram for hourly entries when it is not raining return plt if __name__ == '__main__': entries_histogram('data/turnstile_data_master_with_weather.csv') import numpy as np import scipy import scipy.stats import pandas def mann_whitney_plus_means(turnstile_weather): ''' This function will consume the turnstile_weather dataframe containing our final turnstile weather data. You will want to take the means and run the Mann Whitney U-test on the ENTRIESn_hourly column in the turnstile_weather dataframe. This function should return: 1) the mean of entries with rain 2) the mean of entries without rain 3) the Mann-Whitney U-statistic and p-value comparing the number of entries with rain and the number of entries without rain You should feel free to use scipy's Mann-Whitney implementation, and you might also find it useful to use numpy's mean function. Here are the functions' documentation: http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.mannwhitneyu.html http://docs.scipy.org/doc/numpy/reference/generated/numpy.mean.html You can look at the final turnstile weather data at the link below: https://www.dropbox.com/s/meyki2wl9xfa7yk/turnstile_data_master_with_weather.csv ''' turnstile_weather = pandas.read_csv(turnstile_weather) with_rain = turnstile_weather['ENTRIESn_hourly'][turnstile_weather['rain']==1] with_rain_mean = np.mean(with_rain) without_rain = turnstile_weather['ENTRIESn_hourly'][turnstile_weather['rain']==0] without_rain_mean = np.mean(without_rain) U, p = scipy.stats.mannwhitneyu(with_rain, without_rain) print with_rain_mean, without_rain_mean, U, p return with_rain_mean, without_rain_mean, U, p if __name__ == '__main__': mann_whitney_plus_means('data/turnstile_data_master_with_weather.csv') import numpy as np import pandas from ggplot import * def normalize_features(array): """ Normalize the features in the data set. """ array_normalized = (array-array.mean())/array.std() mu = array.mean() sigma = array.std() return array_normalized, mu, sigma def compute_cost(features, values, theta): """ Compute the cost function given a set of features / values, and the values for our thetas. """ m = len(values) sum_of_square_errors = (numpy.square(numpy.dot(features, theta) - values)).sum() cost = sum_of_square_errors/2*m return cost def gradient_descent(features, values, theta, alpha, num_iterations): """ Perform gradient descent given a data set with an arbitrary number of features. """ m = len(values) cost_history = [] for i in range(num_iterations): predicted_values = np.dot(features, theta) - values theta = theta - ((alpha/m) * np.dot(predicted_values, features)) cost = compute_cost(features, values, theta) cost_history.append(cost) return theta, pandas.Series(cost_history) def predictions(dataframe): ''' The NYC turnstile data is stored in a pandas dataframe called weather_turnstile. Using the information stored in the dataframe, let's predict the ridership of the NYC subway using linear regression with gradient descent. You can see the information contained in the turnstile weather dataframe here: https://www.dropbox.com/s/meyki2wl9xfa7yk/turnstile_data_master_with_weather.csv The prediction should have a R^2 value of 0.40 or better. Note: Due to the memory and CPU limitation of our Amazon EC2 instance, we will give you a random subet (~15%) of the data contained in turnstile_data_master_with_weather.csv If you'd like to view a plot of your cost history, uncomment the call to plot_cost_history below. The slowdown from plotting is significant, so if you are timing out, the first thing to do is to comment out the plot command again. ''' dataframe = pandas.read_csv(dataframe) dummy_units = pandas.get_dummies(dataframe['UNIT'], prefix='unit') features = dataframe[['rain', 'precipi', 'Hour', 'meantempi']].join(dummy_units) values = dataframe[['ENTRIESn_hourly']] m = len(values) features, mu, sigma = normalize_features(features) features['ones'] = np.ones(m) features_array = np.array(features) values_array = np.array(values).flatten() #Set values for alpha, number of iterations. alpha = 0.1 # please feel free to change this value --- alpha is how long the steps be num_iterations = 75 # please feel free to change this value #Initialize theta, perform gradient descent theta_gradient_descent = np.zeros(len(features.columns)) theta_gradient_descent, cost_history = gradient_descent(features_array, values_array, theta_gradient_descent, alpha, num_iterations) plot = None plot = plot_cost_history(alpha, cost_history) predictions = np.dot(features_array, theta_gradient_descent) return predictions, plot print 'working...?' def plot_cost_history(alpha, cost_history): """This function is for viewing the plot of your cost history. You can run it by uncommenting this plot_cost_history(alpha, cost_history) call in predictions. If you want to run this locally, you should print the return value from this function. """ cost_df = pandas.DataFrame({ 'Cost_History': cost_history, 'Iteration': range(len(cost_history)) }) print ggplot(cost_df, aes('Iteration', 'Cost_History')) + \ geom_point() + ggtitle('Cost History for alpha = %.3f' % alpha ) return ggplot(cost_df, aes('Iteration', 'Cost_History')) + \ geom_point() + ggtitle('Cost History for alpha = %.3f' % alpha ) if __name__ == '__main__': predictions('data/turnstile_data_master_with_weather.csv') print 'Go' import numpy as np import pandas def normalize_features(array): """ Normalize the features in the data set. """ #print array array_normalized = (array-array.mean())/array.std() mu = array.mean() sigma = array.std() return array_normalized, mu, sigma def predictions2(dataframe): dataframe = pandas.read_csv(dataframe) dummy_units = pandas.get_dummies(dataframe['UNIT'], prefix='unit') features = dataframe[['rain', 'precipi', 'Hour', 'meantempi']].join(dummy_units) #print features values = dataframe[['ENTRIESn_hourly']] m = len(values) #print features.mean() features, mu, sigma = normalize_features(features) features['ones'] = np.ones(m) features_array = np.array(features) values_array = np.array(values).flatten() if __name__ == '__main__': predictions2('data/turnstile_data_master_with_weather.csv') import numpy as np import scipy import matplotlib.pyplot as plt import sys def compute_r_squared(data, predictions): ''' Calculate R square -- the coefficient of determination. The closer to one, the better the model Given a list of original data points, and also a list of predicted data points, write a function that will compute and return the coefficient of determination (R^2) for this data. numpy.mean() and numpy.sum() might both be useful here, but not necessary. Documentation about numpy.mean() and numpy.sum() below: http://docs.scipy.org/doc/numpy/reference/generated/numpy.mean.html http://docs.scipy.org/doc/numpy/reference/generated/numpy.sum.html ''' # your code here r_squared = 1 - ( np.sum( np.square(data - predictions) )) / np.sum(np.square(data-np.mean(data) )) return r_squared