from blaze import * d = Data('trip_data_*.csv') expr = by(d.passenger_count, avg_distance=d.trip_distance.mean(), count=d.passenger_count.count()) %time _ = compute(expr) d = Data('trip_data.bcolz') expr = by(d.passenger_count, avg_distance=d.trip_distance.mean(), count=d.passenger_count.count()) %time _ = compute(expr) import multiprocessing pool = multiprocessing.Pool(4) %time _ = compute(expr, map=pool.map) ds = dshape("""var * { medallion: string[32, 'ascii'], hack_license: string[32, 'ascii'], vendor_id: string[3, 'ascii'], rate_code: int32, store_and_fwd_flag: string[1, 'ascii'], pickup_datetime: datetime, dropoff_datetime: datetime, passenger_count: int32, trip_time_in_secs: int32, trip_distance: float64, pickup_longitude: float64, pickup_latitude: float64, dropoff_longitude: float64, dropoff_latitude: float64 }""") # Drop old version drop('trip_data.bcolz') # Migrate data %time into('trip_data.bcolz', 'trip_data_*.csv', dshape=ds) d = Data('trip_data_*.csv') discover(d)