#!/usr/bin/env python
# coding: utf-8

# # stdlib!
#
# We're going to import the data, including our own type converter:

# In[1]:

import csv

conversion_map = {
    'carat': float,
    'depth': float,
    'price': int,
    'table': float,
    'x': float,
    'y': float,
    'z': float
}


def converter(type_map, row):
    """Yep, we need to roll our own type conversions."""
    converted_row = {}
    for col, val in row.items():
        cast = type_map.get(col)
        if cast:
            converted_row[col] = cast(val)
        else:
            converted_row[col] = val
    return converted_row


with open('diamonds.csv', 'r') as f:
    reader = csv.DictReader(f)
    diamonds = [converter(conversion_map, r) for r in reader]


# In[2]:

diamonds[:2]


# Adding things up is easy enough...

# In[3]:

def get_total_carats():
    total_carats = 0
    for row in diamonds:
        total_carats += row['carat']
    return total_carats

get_total_carats()


# In[4]:

# Faster/more compact: generator expression!
sum(row['carat'] for row in diamonds)


# In[5]:

# Which is faster?
get_ipython().run_line_magic('timeit', 'get_total_carats()')


# In[6]:

get_ipython().run_line_magic('timeit', "sum(row['carat'] for row in diamonds)")


# But what if we want to group, then add?

# In[7]:

# defaultdict is awesome. defaultdict is awesome.
from collections import defaultdict


def grouper(grouping_col, seq):
    """People have definitely written a faster version than what I'm about to write."""
    groups = defaultdict(lambda: defaultdict(list))
    for row in seq:
        group = groups[row[grouping_col]]
        for k, v in row.items():
            if k != grouping_col:
                group[k].append(v)
    return groups


# In[8]:

groups = grouper('cut', diamonds)


# In[9]:

# Check that the groups include all the rows from the original data
assert sum(len(g['price']) for g in groups.values()) == len(diamonds)


# In[10]:

set(groups)


# What if I wanted to do something like
# ```sql
# select cut, mean(price)
# from diamonds
# group by cut;
# ```

# In[11]:

summary = {}
for group, values in groups.items():
    summary[group] = sum(values['price']) / len(values['price'])
summary


# Or how about this one:
# ```sql
# select max(price)
# from diamonds
# where carat > 1;
# ```

# In[11]:

def get_max_price():
    max_price = 0
    for row in diamonds:
        if row['carat'] > 1 and row['price'] > max_price:
            max_price = row['price']
    return max_price

get_max_price()


# In[12]:

# More compact yet again: generator expression!
max(row['price'] for row in diamonds if row['carat'] > 1)


# In[13]:

# Which is faster?
get_ipython().run_line_magic('timeit', 'get_max_price()')


# In[14]:

get_ipython().run_line_magic('timeit', "max(row['price'] for row in diamonds if row['carat'] > 1)")


# ### itertools interlude
#
# The itertools module does lots of nice things. You should be aware of it, and use it
# where you would otherwise be writing your own bespoke counting and grouping code.
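# For instance, here's a minimal sketch of the same mean-price-per-cut rollup with
# itertools.groupby. Note that groupby only groups *consecutive* items, so the input
# has to be sorted by the grouping key first:

from itertools import groupby
from operator import itemgetter

itertools_summary = {}
for cut, rows in groupby(sorted(diamonds, key=itemgetter('cut')), key=itemgetter('cut')):
    prices = [r['price'] for r in rows]
    itertools_summary[cut] = sum(prices) / len(prices)
itertools_summary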
# # Toolz!
#
# Let's see what it looks like to repeat some of this analysis using the toolz
# library. PSA: toolz has a pretty huge API, and it's worth reading through the
# docs: http://toolz.readthedocs.org/en/latest/api.html

# In[15]:

import toolz as tz


# Some quick toolz fun things:

# In[16]:

list(tz.concat(([1, 2, 3], (4, 5, 6))))


# In[17]:

list(tz.mapcat(lambda r: [x + "-foo" for x in r],
               [["A", "B"], ("c", "d"), ("bar", "baz")]))


# In[18]:

tz.frequencies([r['color'] for r in diamonds])


# Remember that toolz is lazy: functions will return generator-like things.

# In[19]:

take_2 = tz.take(2, diamonds)
take_2


# In[20]:

unique_clarity = tz.unique(diamonds, key=lambda x: x.get('clarity'))
unique_clarity


# In[21]:

list(take_2)


# In[22]:

# Note that this returns the entire rows, not just the clarity values
list(unique_clarity)


# In[23]:

# What are our clarity counts?
tz.countby(lambda x: x['clarity'], diamonds)


# In[24]:

# What about our max price from above? Reduction!
def comparo(accum, row):
    price = row['price']
    if price > accum:
        return price
    else:
        return accum

tz.reduce(comparo, diamonds, 0)


# In[25]:

# We could have also threaded here
tz.thread_last(diamonds, (tz.map, lambda x: x['price']), max)


# In[26]:

# Which is faster?
get_ipython().run_line_magic('timeit', 'tz.reduce(comparo, diamonds, 0)')


# In[27]:

get_ipython().run_line_magic('timeit', "tz.thread_last(diamonds, (tz.map, lambda x: x['price']), max)")


# Let's look at another SQL query: say we want a count per clarity where price > 1000:
# ```sql
# select count(1)
# from diamonds
# where price > 1000
# group by clarity;
# ```

# In[28]:

# Toolz has currying!
import toolz.curried as tzc

tzc.pipe(diamonds,
         tzc.filter(lambda r: r['price'] > 1000),
         tzc.map(lambda r: (r['clarity'],)),
         tzc.countby(lambda r: r[0]),
         dict)


# In[29]:

# We can go about this another way as well:
def filter_and_count(kv):
    f_and_c = tz.thread_last(kv[1],
                             (tz.filter, lambda r: r['price'] > 1000),
                             tz.count)
    return kv[0], f_and_c

tz.thread_last(diamonds,
               (tz.groupby, 'clarity'),
               (tz.itemmap, filter_and_count))


# In[30]:

# Cleanest/best way: reduceby = groupby + reduce
def increment(accum, row):
    if row['price'] > 1000:
        return accum + 1
    else:
        return accum

tz.reduceby('clarity', increment, diamonds, 0)


# # Pandas!

# In[31]:

import pandas as pd
# We don't need this to use Pandas, FYI
import numpy as np

# CSV reader is fast!
df = pd.read_csv('diamonds.csv', index_col=0)
# Keep this for later, we're going to overwrite df
df_diamonds = df


# In[32]:

df.head()


# In[33]:

df.describe()


# In[34]:

df.groupby('clarity').mean()


# Our previous queries:
# ```sql
# select cut, mean(price)
# from diamonds
# group by cut;
#
# select count(carat)
# from diamonds
# where price > 1000
# group by clarity;
#
# select max(price)
# from diamonds
# where carat > 1;
#
# select cut, price
# from diamonds
# where cut in ('Ideal', 'Premium')
# order by price desc
# limit 10;
# ```
#
# are pretty trivial operations in Pandas:

# In[35]:

df.groupby('cut')['price'].mean()


# In[36]:

df[df['price'] > 1000].groupby('clarity')['carat'].count()


# In[37]:

df[df['carat'] > 1]['price'].max()


# In[38]:

df[df['cut'].isin(['Ideal', 'Premium'])].sort('price', ascending=False)[:10]
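# A quick, hedged sanity check (assuming the `summary` dict from the stdlib
# section is still in scope): the Pandas group means should line up with the
# hand-rolled version from earlier, up to integer-division/floating-point noise.

pandas_means = df.groupby('cut')['price'].mean()
{cut: (stdlib_mean, pandas_means[cut]) for cut, stdlib_mean in summary.items()}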
# In which I do a bunch of cool Pandas things without a real goal

# In[39]:

# I can Transpose things!
df.T


# In[40]:

# SORT ALL THE THINGS!
df.sort(['price', 'carat'], ascending=False).head()


# In[41]:

# Let's use some fake data to show off some stuff:
simple_data_1 = {"int_col": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                 "str_col": ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j"],
                 "float_col": [1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 8.5, 9.5, 10.5],
                 "time_col": ["2015-01-01", "2015-01-02", "2015-01-03", "2015-01-04",
                              "2015-01-05", "2015-01-06", "2015-01-07", "2015-01-08",
                              "2015-01-09", "2015-01-10"]}
my_index = ["a", "b", "c", "a", "b", "c", "a", "b", "c", "a"]

df = pd.DataFrame(simple_data_1, index=my_index)
df


# In[42]:

# DataFrames have indices that can be gotten by label or position
df.loc['a']


# In[43]:

df.iloc[5]


# In[44]:

df[2:4]


# In[45]:

# New column! With missing data!
df['new_col'] = [np.nan, np.nan, 1.0, 2.0, np.nan, 4.0, 5.0, 9.0, np.nan, 10.0]
df


# In[46]:

# Remove missing data!
df.dropna()


# In[47]:

# Fill missing data!
df.fillna("FOO!")


# In[48]:

# Backfill missing data!
df.fillna(method='bfill')


# In[49]:

# Vectorized string methods!
df['str_col'] = ["FOO", "FoO", "Foo", "Fo o", "FOO", "fOO", "fOo", "FoO", "foO", "Foo "]
df


# In[50]:

df['str_col'].str.lower().str.replace(' ', '')


# In[51]:

# Database-style joins!!
left = pd.DataFrame({'key': ['foo', 'bar', 'fizz'], 'lval': [1, 2, 3]})
right = pd.DataFrame({'key': ['foo', 'bar', 'baz', 'qux'], 'rval': [4, 5, 6, 7]})
left.merge(right, how='inner')


# In[52]:

right.merge(left, how='outer')


# In[53]:

left.merge(right, how='left')


# In[54]:

# Let's make some random time series data
dates = pd.date_range('2015-03-25', periods=150, freq='H')
dates


# In[55]:

time_df = pd.DataFrame(np.random.randint(0, 500, 150), index=dates, columns=["Numeric"])
time_df.head()


# In[56]:

# RESAMPLE!
time_df.resample('D', how='mean')


# In[57]:

# Convert weird date formats!
the_worst = ['3/25/2014 23:02:00', '2014-03-25 23:01:00', 'March 25 2015']
pd.to_datetime(the_worst)


# In[58]:

# Shift dates!
time_df.shift(1, freq='D').head()


# In[59]:

# What if I have missing dates?
missing = pd.to_datetime(['2015-03-25', '2015-03-30', '2015-04-05'])
missing_df = pd.DataFrame(np.random.randint(0, 10, 3), index=missing, columns=["Numeric"])
missing_df


# In[60]:

missing_df.asfreq('D')


# In[61]:

missing_df.asfreq('D', method='pad')


# # XRAY!

# In[62]:

import xray


# In[63]:

arr = np.array([[1, 2, 3, 4], [10, 20, 30, 40], [100, 200, 300, 400]])
dim0_coords = ['a', 'b', 'c']
dim1_coords = ['foo', 'bar', 'baz', 'qux']
da = xray.DataArray(arr, [('x', dim0_coords), ('y', dim1_coords)])
da


# In[64]:

# Arrays!
da.values


# In[65]:

da.dims


# In[66]:

da.coords


# In[67]:

# But with some Pandas-like powers!
# Index by slice
da[0:2]


# In[68]:

# Indexing!
da.loc['a']


# In[69]:

da.loc['a':'c']


# In[70]:

da.sel(x=['a', 'c'])


# In[71]:

da.sel(y='foo')


# In[72]:

da.isel(x=0)


# In[73]:

# Do numpy stuff
np.mean(da), np.sum(da)


# In[74]:

np.sin(da)
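# There's also a handy bridge back to Pandas (a hedged aside; this assumes the
# installed xray version exposes DataArray.to_series): the labeled dimensions
# become the index of the resulting pandas object.

da.to_series()            # Series with a MultiIndex over (x, y)
da.to_series().unstack()  # ...and back out to a DataFrame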
# In[75]:

# Broadcast!
da + 100


# In[76]:

# Do all of the above by label
da.mean(dim='x')


# In[77]:

da.sum(dim='y')


# In[78]:

da


# In[79]:

# Group stuff
da.groupby('x').mean()


# In[80]:

da.groupby('y').sum()


# In[81]:

np.random.randn(2, 2, 3)


# In[82]:

# Datasets
x_y = np.array([[1, 1.1, 1.2],
                [2, 2.1, 2.2],
                [3, 3.1, 3.2]])
z_coords = np.array(["10s", "20s"])
data_cube = np.array([[[10, 10, 10], [10, 10, 10], [10, 10, 10]],
                      [[20, 20, 20], [20, 20, 20], [20, 20, 20]]])

ds = xray.Dataset({"cube": (["z", "x", "y"], data_cube)},
                  coords={"z": z_coords,
                          "x": ["col_1", "col_2", "col_3"],
                          "y": ["row_1", "row_2", "row_3"]})
ds


# In[83]:

ds.sum(dim="z")


# In[84]:

ds.mean(dim="x")


# In[85]:

ds.groupby("x").sum()


# # Blaze
#
# For this demo, we're going to focus on a couple of queries we used in the Pandas demo:
# ```sql
# select cut, mean(price)
# from diamonds
# group by cut;
#
# select count(carat)
# from diamonds
# where price > 1000
# group by clarity;
# ```

# In[86]:

import blaze as bz

bz_diamonds = bz.symbol('diamonds', bz.discover(df_diamonds))
type(bz_diamonds)


# In[93]:

mean_price = bz.by(bz_diamonds.cut, price=bz_diamonds.price.mean())
clarity_count = bz.by(bz_diamonds[bz_diamonds.price > 1000].clarity,
                      count=bz_diamonds.carat.count())


# In[88]:

# We haven't actually computed anything yet!
# Let's make Pandas compute it.
bz.compute(mean_price, df_diamonds)


# In[94]:

bz.compute(clarity_count, df_diamonds)


# Ok, so what? You made Pandas do a thing we already did.
#
# ### Oh yeah, what if we want Postgres to compute *the exact same expressions*?
#
# ## WAT

# In[95]:

# Blaze/Odo make it easy to move data between containers.
# Note that we have an empty table already created.
pg_datasource = bz.odo(df_diamonds,
                       "postgresql://postgres:postgres@localhost/pydata::diamonds")


# In[97]:

# Now we're going to use Postgres as our computation engine
result = bz.compute(clarity_count, pg_datasource)
result


# In[99]:

# I don't want a selectable. I want a DataFrame.
# odo again
bz.odo(bz.compute(clarity_count, pg_datasource), pd.DataFrame)


# You can use any SQL database supported by SQLAlchemy as your computation engine.
# Blaze also supports Python lists, Spark DataFrames, MongoDB, NumPy arrays...

# # bcolz!

# In[100]:

import bcolz

dc = bcolz.ctable.fromdataframe(df_diamonds)


# In[101]:

dc.cols


# In[102]:

dsize = dc.cbytes / 2**20.
print("Total size for the ctable: {} MB".format(dsize))
print("Compression ratio for the ctable: {}".format(dc.nbytes / float(dc.cbytes)))


# In[103]:

# You can do DataFrame-like stuff
dc["cut == 'Premium'"]


# In[104]:

dc["(cut == 'Ideal') & (price > 1)"]


# In[106]:

# We can do the same thing with the ctable on disk!
diskdc = dc.copy(rootdir='diamonds')


# In[107]:

diskdc["(cut == 'Ideal') & (price > 1000)"]


# In[108]:

import os

for root, dirs, files in os.walk('diamonds'):
    level = root.replace('diamonds', '').count(os.sep)
    indent = ' ' * 4 * level
    print('{}{}/'.format(indent, os.path.basename(root)))
    subindent = ' ' * 4 * (level + 1)
    for f in files:
        print('{}{}'.format(subindent, f))
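# A hedged sketch of why the on-disk copy is useful (it assumes bcolz.open is
# available and that a string query returns a structured array pandas can ingest):
# reopen the ctable in a later session and pull a filtered result back into pandas.

reopened = bcolz.open('diamonds')
pd.DataFrame(reopened["(cut == 'Ideal') & (price > 1000)"]).head()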
# # Dask!

# In[109]:

import dask.array as da
import dask.dataframe as dd
import dask.bag as db

d_arr = da.from_array(np.random.randn(100000), chunks=100)
d_arr


# In[110]:

d_arr.sum()


# Wait, what happened? Why didn't I get back a number?
#
# Because dask is lazily evaluated, we must `compute` it!

# In[111]:

d_arr.sum().compute()


# In[112]:

d_arr.mean().compute()


# In[113]:

d_arr.max().compute()


# In[114]:

import dask.dataframe as dd

ddf = dd.read_csv('diamonds.csv')
ddf.groupby('cut')['price'].mean().compute()


# In[115]:

ddf['price'].sum().compute()


# In[116]:

ddf[ddf['price'] > 1000].groupby('clarity')['carat'].count().compute()


# We're not quite feature-complete yet...

# In[117]:

ddf[ddf['cut'].isin(['Ideal', 'Premium'])].sort('price', ascending=False)[:10]


# In[118]:

diamonds_b = db.from_sequence(diamonds)


# In[119]:

tz.thread_last(diamonds_b, (tz.map, lambda x: x['price']), max)


# In[120]:

tzc.pipe(diamonds_b,
         tzc.filter(lambda r: r['price'] > 1000),
         tzc.map(lambda r: (r['clarity'],)),
         tzc.countby(lambda r: r[0]),
         dict)
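# dask.bag also exposes a toolz-flavored API of its own; here's a hedged sketch
# (assuming Bag provides filter/pluck/frequencies methods) of the same
# count-by-clarity query done natively, with the laziness resolved by compute():

dict(diamonds_b.filter(lambda r: r['price'] > 1000)
               .pluck('clarity')
               .frequencies()
               .compute())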