%matplotlib inline %config InlineBackend.figure_format='retina' import os import numpy as np import pandas as pd # VARIABLES # Paths. faredata_path = '../data/faredata/trip_fare_{0}.csv' tripdata_path = '../data/tripdata/trip_data_{0}.csv' cleaned_data_path = '../data/cleaned/cleaned_{0}.csv' # Tip percentage variables. tip_perc_necesary_columns = ['fare_amount', 'surcharge', 'mta_tax', 'tip_amount'] tip_perc_column_name = 'tip_perc' # Coordinates variables. min_x = -74.361107 min_y = 40.459518 max_x = -71.903083 max_y = 41.175342 step_x = 0.003205 # 270 meters (885 feet) approximately. step_y = 0.002429 # 270 meters (885 feet) approximately. # Number of maximum areas per axis in the 'area matrix'. max_area_x = 766 # int((max_x - min_x) / step_x) max_area_y = 294 # int((max_y - min_y) / step_y) # Number of columns of the 'area matrix'. num_columns = 767 # max_area_x + 1 # The area at the top right corner in the 'area matrix'. max_area = 226264 # (max_area_y * num_columns) + max_area_x # Minimum number of pickups or dropoffs per area. min_area_number = 20 # Columns for creating the pickup and dropoff area attributes. area_column_names = ['pickup_area', 'dropoff_area'] area_necesary_columns = ['pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude'] # Order of the columns to apply it at before saving. column_order = ['medallion', 'hack_license', 'vendor_id', 'pickup_datetime', 'fare_amount', 'surcharge', 'tip_amount', 'tip_perc', 'tolls_amount', 'total_amount', 'passenger_count', 'trip_time_in_secs', 'trip_distance', 'pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude'] # FUNCTIONS # Function for caculating the tip percentage of a trip. def calculate_tip_perc(row): subtotal = row.fare_amount + row.surcharge + row.mta_tax tip = row.tip_amount / subtotal tip_perc = tip * 100 return pd.Series({tip_perc_column_name: tip_perc}) # Functions for calculating. the pickup and dropoff areas in the 'area matrix'. def get_1d_area(coor, min_coor, step_coor): return int((coor - min_coor) / step_coor) def get_area(area_x, area_y, columns): return (area_y * columns) + area_x def calculate_area(row): pickup_x = get_1d_area(row.pickup_longitude, min_x, step_x) pickup_y = get_1d_area(row.pickup_latitude, min_y, step_y) pickup_area = get_area(pickup_x, pickup_y, num_columns) dropoff_x = get_1d_area(row.dropoff_longitude, min_x, step_x) dropoff_y = get_1d_area(row.dropoff_latitude, min_y, step_y) dropoff_area = get_area(dropoff_x, dropoff_y, num_columns) return pd.Series({ area_column_names[0]: pickup_area, area_column_names[1]: dropoff_area }) # This auxiliary function applies another one to every row in a DataFrame for creating new columns. def iterate_and_apply(dataframe, function, necesary_columns): perform = True step = 100000 start = 0 to = step while perform: new_columns = dataframe[start:to][necesary_columns].apply(function, axis=1) if len(new_columns) == 0: perform = False else: dataframe.update(new_columns) new_columns = None start += step to += step return dataframe # Creating the directory for saving the cleaned files if it doesn't exist. if not os.path.exists('../data/cleaned/'): os.makedirs('../data/cleaned/') # The range of files to clean. months = range(1, 8) + range(9, 13) for month in months: # Cleaning 'fare' data. data = pd.read_csv(faredata_path.format(month), skipinitialspace=True) payment_type = (data.payment_type == 'CRD') fare_amount = ((data.fare_amount >= 3.0) & (data.fare_amount <= 200.0)) surcharge = ((data.surcharge == 0.0) | (data.surcharge == 0.5) | (data.surcharge == 1.0)) mta_tax = (data.mta_tax == 0.5) tip_amount = ((data.tip_amount >= 0.0) & (data.tip_amount <= 100.0)) tolls_amount = ((data.tolls_amount >= 0.0) & (data.tolls_amount <= 30.0)) data = data[payment_type & fare_amount & surcharge & mta_tax & tip_amount & tolls_amount] data.drop(['payment_type'], axis=1, inplace=True) payment_type = None fare_amount = None surcharge = None mta_tax = None tip_amount = None tolls_amount = None # Calculating the tip percentage. data[tip_perc_column_name] = np.nan data = iterate_and_apply(data, calculate_tip_perc, tip_perc_necesary_columns) data.drop(['mta_tax'], axis=1, inplace=True) tip_perc = (data.tip_perc <= 50.0) data = data[tip_perc] tip_perc = None # Adding the trip data file. tripdata = pd.read_csv(tripdata_path.format(month), skipinitialspace=True, usecols=[3, 7, 8, 9, 10, 11, 12, 13]) for column in tripdata.columns: data[column] = np.nan ''' Careful! The next line use a lot of memory, a least more than 8GB. It's showed here beacuse it's simplier that the another way I know to update the 'fare' DataFrame: Read the 'trip' file with NumPy for creating a lot of little DataFrames and update them into the data variable in a loop. ''' data.update(tripdata) tripdata = None # Cleaning 'trip' data. rate_code = (data.rate_code == 1.0) passenger_count = ((data.passenger_count >= 1.0) & (data.passenger_count <= 6.0)) trip_time_in_secs = ((data.trip_time_in_secs > 0.0) & (data.trip_time_in_secs <= 3600.0)) trip_distance = ((data.trip_distance > 0.0) & (data.trip_distance <= 25.0)) pickup_latitude = ((data.pickup_latitude >= 40.459518) & (data.pickup_latitude <= 41.175342)) pickup_longitude = ((data.pickup_longitude >= -74.361107) & (data.pickup_longitude <= -71.903083)) dropoff_latitude = ((data.dropoff_latitude >= 40.459518) & (data.dropoff_latitude <= 41.175342)) dropoff_longitude = ((data.dropoff_longitude >= -74.361107) & (data.dropoff_longitude <= -71.903083)) data = data[rate_code & passenger_count & trip_time_in_secs & trip_distance & pickup_latitude & pickup_longitude & dropoff_latitude & dropoff_longitude] data.drop(['rate_code'], axis=1, inplace=True) rate_code = None passenger_count = None trip_time_in_secs = None trip_distance = None pickup_latitude = None pickup_longitude = None dropoff_latitude = None dropoff_longitude = None # Filtering the coordinates of unusal places. for column in area_column_names: data[column] = np.nan data = iterate_and_apply(data, calculate_area, area_necesary_columns) aux_pickup = data.groupby(['pickup_area']).size() aux_pickup = aux_pickup[aux_pickup >= min_area_number] aux_pickup = data['pickup_area'].isin(aux_pickup.index) aux_dropoff = data.groupby(['dropoff_area']).size() aux_dropoff = aux_dropoff[aux_dropoff >= min_area_number] aux_dropoff = data['dropoff_area'].isin(aux_dropoff.index) data = data[aux_pickup & aux_dropoff] data.drop(['pickup_area', 'dropoff_area'], axis=1, inplace=True) aux_pickup = None aux_dropoff = None # Saving the file. data = data.reindex_axis(column_order, axis=1) data.to_csv(cleaned_data_path.format(month), index=True) data = None