This notebook will perform the process that was applied to the August's data in the [previous notebook](2. Exploring a subdataset.ipynb) to the rest of months. So, if you're just reading the project you can skip this file and open the [next notebook](4. Obtaining a sample and improving the data.ipynb).
%matplotlib inline
%config InlineBackend.figure_format='retina'
import os
import numpy as np
import pandas as pd
# VARIABLES
# Paths.
faredata_path = '../data/faredata/trip_fare_{0}.csv'
tripdata_path = '../data/tripdata/trip_data_{0}.csv'
cleaned_data_path = '../data/cleaned/cleaned_{0}.csv'
# Tip percentage variables.
tip_perc_necesary_columns = ['fare_amount', 'surcharge', 'mta_tax', 'tip_amount']
tip_perc_column_name = 'tip_perc'
# Coordinates variables.
min_x = -74.361107
min_y = 40.459518
max_x = -71.903083
max_y = 41.175342
step_x = 0.003205 # 270 meters (885 feet) approximately.
step_y = 0.002429 # 270 meters (885 feet) approximately.
# Number of maximum areas per axis in the 'area matrix'.
max_area_x = 766 # int((max_x - min_x) / step_x)
max_area_y = 294 # int((max_y - min_y) / step_y)
# Number of columns of the 'area matrix'.
num_columns = 767 # max_area_x + 1
# The area at the top right corner in the 'area matrix'.
max_area = 226264 # (max_area_y * num_columns) + max_area_x
# Minimum number of pickups or dropoffs per area.
min_area_number = 20
# Columns for creating the pickup and dropoff area attributes.
area_column_names = ['pickup_area', 'dropoff_area']
area_necesary_columns = ['pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude']
# Order of the columns to apply it at before saving.
column_order = ['medallion', 'hack_license', 'vendor_id', 'pickup_datetime', 'fare_amount', 'surcharge', 'tip_amount',
'tip_perc', 'tolls_amount', 'total_amount', 'passenger_count', 'trip_time_in_secs', 'trip_distance',
'pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude']
# FUNCTIONS
# Function for caculating the tip percentage of a trip.
def calculate_tip_perc(row):
subtotal = row.fare_amount + row.surcharge + row.mta_tax
tip = row.tip_amount / subtotal
tip_perc = tip * 100
return pd.Series({tip_perc_column_name: tip_perc})
# Functions for calculating. the pickup and dropoff areas in the 'area matrix'.
def get_1d_area(coor, min_coor, step_coor):
return int((coor - min_coor) / step_coor)
def get_area(area_x, area_y, columns):
return (area_y * columns) + area_x
def calculate_area(row):
pickup_x = get_1d_area(row.pickup_longitude, min_x, step_x)
pickup_y = get_1d_area(row.pickup_latitude, min_y, step_y)
pickup_area = get_area(pickup_x, pickup_y, num_columns)
dropoff_x = get_1d_area(row.dropoff_longitude, min_x, step_x)
dropoff_y = get_1d_area(row.dropoff_latitude, min_y, step_y)
dropoff_area = get_area(dropoff_x, dropoff_y, num_columns)
return pd.Series({
area_column_names[0]: pickup_area,
area_column_names[1]: dropoff_area
})
# This auxiliary function applies another one to every row in a DataFrame for creating new columns.
def iterate_and_apply(dataframe, function, necesary_columns):
perform = True
step = 100000
start = 0
to = step
while perform:
new_columns = dataframe[start:to][necesary_columns].apply(function, axis=1)
if len(new_columns) == 0:
perform = False
else:
dataframe.update(new_columns)
new_columns = None
start += step
to += step
return dataframe
# Creating the directory for saving the cleaned files if it doesn't exist.
if not os.path.exists('../data/cleaned/'):
os.makedirs('../data/cleaned/')
# The range of files to clean.
months = range(1, 8) + range(9, 13)
for month in months:
# Cleaning 'fare' data.
data = pd.read_csv(faredata_path.format(month), skipinitialspace=True)
payment_type = (data.payment_type == 'CRD')
fare_amount = ((data.fare_amount >= 3.0) & (data.fare_amount <= 200.0))
surcharge = ((data.surcharge == 0.0) | (data.surcharge == 0.5) | (data.surcharge == 1.0))
mta_tax = (data.mta_tax == 0.5)
tip_amount = ((data.tip_amount >= 0.0) & (data.tip_amount <= 100.0))
tolls_amount = ((data.tolls_amount >= 0.0) & (data.tolls_amount <= 30.0))
data = data[payment_type & fare_amount & surcharge & mta_tax & tip_amount & tolls_amount]
data.drop(['payment_type'], axis=1, inplace=True)
payment_type = None
fare_amount = None
surcharge = None
mta_tax = None
tip_amount = None
tolls_amount = None
# Calculating the tip percentage.
data[tip_perc_column_name] = np.nan
data = iterate_and_apply(data, calculate_tip_perc, tip_perc_necesary_columns)
data.drop(['mta_tax'], axis=1, inplace=True)
tip_perc = (data.tip_perc <= 50.0)
data = data[tip_perc]
tip_perc = None
# Adding the trip data file.
tripdata = pd.read_csv(tripdata_path.format(month), skipinitialspace=True, usecols=[3, 7, 8, 9, 10, 11, 12, 13])
for column in tripdata.columns:
data[column] = np.nan
'''
Careful! The next line use a lot of memory, a least more than 8GB.
It's showed here beacuse it's simplier that the another way I know to update the 'fare' DataFrame:
Read the 'trip' file with NumPy for creating a lot of little DataFrames and update them
into the data variable in a loop.
'''
data.update(tripdata)
tripdata = None
# Cleaning 'trip' data.
rate_code = (data.rate_code == 1.0)
passenger_count = ((data.passenger_count >= 1.0) & (data.passenger_count <= 6.0))
trip_time_in_secs = ((data.trip_time_in_secs > 0.0) & (data.trip_time_in_secs <= 3600.0))
trip_distance = ((data.trip_distance > 0.0) & (data.trip_distance <= 25.0))
pickup_latitude = ((data.pickup_latitude >= 40.459518) & (data.pickup_latitude <= 41.175342))
pickup_longitude = ((data.pickup_longitude >= -74.361107) & (data.pickup_longitude <= -71.903083))
dropoff_latitude = ((data.dropoff_latitude >= 40.459518) & (data.dropoff_latitude <= 41.175342))
dropoff_longitude = ((data.dropoff_longitude >= -74.361107) & (data.dropoff_longitude <= -71.903083))
data = data[rate_code & passenger_count & trip_time_in_secs & trip_distance & pickup_latitude & pickup_longitude
& dropoff_latitude & dropoff_longitude]
data.drop(['rate_code'], axis=1, inplace=True)
rate_code = None
passenger_count = None
trip_time_in_secs = None
trip_distance = None
pickup_latitude = None
pickup_longitude = None
dropoff_latitude = None
dropoff_longitude = None
# Filtering the coordinates of unusal places.
for column in area_column_names:
data[column] = np.nan
data = iterate_and_apply(data, calculate_area, area_necesary_columns)
aux_pickup = data.groupby(['pickup_area']).size()
aux_pickup = aux_pickup[aux_pickup >= min_area_number]
aux_pickup = data['pickup_area'].isin(aux_pickup.index)
aux_dropoff = data.groupby(['dropoff_area']).size()
aux_dropoff = aux_dropoff[aux_dropoff >= min_area_number]
aux_dropoff = data['dropoff_area'].isin(aux_dropoff.index)
data = data[aux_pickup & aux_dropoff]
data.drop(['pickup_area', 'dropoff_area'], axis=1, inplace=True)
aux_pickup = None
aux_dropoff = None
# Saving the file.
data = data.reindex_axis(column_order, axis=1)
data.to_csv(cleaned_data_path.format(month), index=True)
data = None