Notebook

3. Cleaning the dataset¶

This notebook will perform the process that was applied to the August's data in the [previous notebook](2. Exploring a subdataset.ipynb) to the rest of months. So, if you're just reading the project you can skip this file and open the [next notebook](4. Obtaining a sample and improving the data.ipynb).

3.1 Preparing the notebook¶

In [1]:

%matplotlib inline
%config InlineBackend.figure_format='retina'

In [2]:

import os

import numpy as np
import pandas as pd

In [3]:

# VARIABLES

# Paths.
faredata_path = '../data/faredata/trip_fare_{0}.csv'
tripdata_path = '../data/tripdata/trip_data_{0}.csv'
cleaned_data_path = '../data/cleaned/cleaned_{0}.csv'


# Tip percentage variables.
tip_perc_necesary_columns = ['fare_amount', 'surcharge', 'mta_tax', 'tip_amount']
tip_perc_column_name = 'tip_perc'


# Coordinates variables.
min_x = -74.361107
min_y = 40.459518

max_x = -71.903083
max_y = 41.175342

step_x = 0.003205  # 270 meters (885 feet) approximately.
step_y = 0.002429  # 270 meters (885 feet) approximately.

# Number of maximum areas per axis in the 'area matrix'.
max_area_x = 766  # int((max_x - min_x) / step_x)
max_area_y = 294  # int((max_y - min_y) / step_y)

# Number of columns of the 'area matrix'.
num_columns = 767  # max_area_x + 1

# The area at the top right corner in the 'area matrix'.
max_area = 226264  # (max_area_y * num_columns) + max_area_x

# Minimum number of pickups or dropoffs per area.
min_area_number = 20

# Columns for creating the pickup and dropoff area attributes.
area_column_names = ['pickup_area', 'dropoff_area']
area_necesary_columns = ['pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude']

# Order of the columns to apply it at before saving.
column_order = ['medallion', 'hack_license', 'vendor_id', 'pickup_datetime', 'fare_amount', 'surcharge', 'tip_amount',
                'tip_perc', 'tolls_amount', 'total_amount', 'passenger_count', 'trip_time_in_secs', 'trip_distance',
                'pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude']

In [4]:

# FUNCTIONS

# Function for caculating the tip percentage of a trip.
def calculate_tip_perc(row):
    subtotal = row.fare_amount + row.surcharge + row.mta_tax
    tip = row.tip_amount / subtotal
    tip_perc = tip * 100
    
    return pd.Series({tip_perc_column_name: tip_perc})


# Functions for calculating. the pickup and dropoff areas in the 'area matrix'.
def get_1d_area(coor, min_coor, step_coor):
    return int((coor - min_coor) / step_coor)

def get_area(area_x, area_y, columns):
    return (area_y * columns) + area_x

def calculate_area(row):
    pickup_x = get_1d_area(row.pickup_longitude, min_x, step_x)
    pickup_y = get_1d_area(row.pickup_latitude, min_y, step_y)
    pickup_area = get_area(pickup_x, pickup_y, num_columns)
    
    dropoff_x = get_1d_area(row.dropoff_longitude, min_x, step_x)
    dropoff_y = get_1d_area(row.dropoff_latitude, min_y, step_y)
    dropoff_area = get_area(dropoff_x, dropoff_y, num_columns)
    
    return pd.Series({
        area_column_names[0]: pickup_area,
        area_column_names[1]: dropoff_area
    })


# This auxiliary function applies another one to every row in a DataFrame for creating new columns.
def iterate_and_apply(dataframe, function, necesary_columns):
    perform = True
    step = 100000
    start = 0
    to = step
    
    while perform:
        new_columns = dataframe[start:to][necesary_columns].apply(function, axis=1)
        if len(new_columns) == 0:
            perform = False
        else:
            dataframe.update(new_columns)
        new_columns = None
        start += step
        to += step
    
    return dataframe

In [5]:

# Creating the directory for saving the cleaned files if it doesn't exist.
if not os.path.exists('../data/cleaned/'):
    os.makedirs('../data/cleaned/')

3.2. Performing the process¶

In [6]:

# The range of files to clean.
months = range(1, 8) + range(9, 13)
for month in months:
    
    # Cleaning 'fare' data.
    
    data = pd.read_csv(faredata_path.format(month), skipinitialspace=True)
    
    payment_type = (data.payment_type == 'CRD')
    fare_amount = ((data.fare_amount >= 3.0) & (data.fare_amount <= 200.0))
    surcharge = ((data.surcharge == 0.0) | (data.surcharge == 0.5) | (data.surcharge == 1.0))
    mta_tax = (data.mta_tax == 0.5)
    tip_amount = ((data.tip_amount >= 0.0) & (data.tip_amount <= 100.0))
    tolls_amount = ((data.tolls_amount >= 0.0) & (data.tolls_amount <= 30.0))

    data = data[payment_type & fare_amount & surcharge & mta_tax & tip_amount & tolls_amount]
    data.drop(['payment_type'], axis=1, inplace=True)
    
    payment_type = None
    fare_amount = None
    surcharge = None
    mta_tax = None
    tip_amount = None
    tolls_amount = None
    
    # Calculating the tip percentage.
    
    data[tip_perc_column_name] = np.nan
    data = iterate_and_apply(data, calculate_tip_perc, tip_perc_necesary_columns)
    data.drop(['mta_tax'], axis=1, inplace=True)
    
    tip_perc = (data.tip_perc <= 50.0)
    data = data[tip_perc]
    tip_perc = None
    
    # Adding the trip data file.
    
    tripdata = pd.read_csv(tripdata_path.format(month), skipinitialspace=True, usecols=[3, 7, 8, 9, 10, 11, 12, 13])
    
    for column in tripdata.columns:
        data[column] = np.nan
    
    '''
    Careful! The next line use a lot of memory, a least more than 8GB.
    It's showed here beacuse it's simplier that the another way I know to update the 'fare' DataFrame:
        Read the 'trip' file with NumPy for creating a lot of little DataFrames and update them
        into the data variable in a loop.
    '''
    data.update(tripdata)
    tripdata = None
    
    # Cleaning 'trip' data.
    
    rate_code = (data.rate_code == 1.0)
    passenger_count = ((data.passenger_count >= 1.0) & (data.passenger_count <= 6.0))
    trip_time_in_secs = ((data.trip_time_in_secs > 0.0) & (data.trip_time_in_secs <= 3600.0))
    trip_distance = ((data.trip_distance > 0.0) & (data.trip_distance <= 25.0))
    pickup_latitude = ((data.pickup_latitude >= 40.459518) & (data.pickup_latitude <= 41.175342))
    pickup_longitude = ((data.pickup_longitude >= -74.361107) & (data.pickup_longitude <= -71.903083))
    dropoff_latitude = ((data.dropoff_latitude >= 40.459518) & (data.dropoff_latitude <= 41.175342))
    dropoff_longitude = ((data.dropoff_longitude >= -74.361107) & (data.dropoff_longitude <= -71.903083))
    
    data = data[rate_code & passenger_count & trip_time_in_secs & trip_distance & pickup_latitude & pickup_longitude
                & dropoff_latitude & dropoff_longitude]
    data.drop(['rate_code'], axis=1, inplace=True)
    
    rate_code = None
    passenger_count = None
    trip_time_in_secs = None
    trip_distance = None
    pickup_latitude = None
    pickup_longitude = None
    dropoff_latitude = None
    dropoff_longitude = None
    
    # Filtering the coordinates of unusal places.
    
    for column in area_column_names:
        data[column] = np.nan
    
    data = iterate_and_apply(data, calculate_area, area_necesary_columns)
    
    aux_pickup = data.groupby(['pickup_area']).size()
    aux_pickup = aux_pickup[aux_pickup >= min_area_number]
    aux_pickup = data['pickup_area'].isin(aux_pickup.index)
    
    aux_dropoff = data.groupby(['dropoff_area']).size()
    aux_dropoff = aux_dropoff[aux_dropoff >= min_area_number]
    aux_dropoff = data['dropoff_area'].isin(aux_dropoff.index)
    
    data = data[aux_pickup & aux_dropoff]
    data.drop(['pickup_area', 'dropoff_area'], axis=1, inplace=True)

    aux_pickup = None
    aux_dropoff = None
    
    # Saving the file.
    
    data = data.reindex_axis(column_order, axis=1)
    data.to_csv(cleaned_data_path.format(month), index=True)
    data = None