#!/usr/bin/env python
# coding: utf-8

# # stdlib!
#
# We're going to import the data, including our own type converter:

# In[1]:

import csv

conversion_map = {
    'carat': float,
    'depth': float,
    'price': int,
    'table': float,
    'x': float,
    'y': float,
    'z': float
}


def converter(type_map, row):
    """Yep, we need to roll our own type conversions."""
    converted_row = {}
    for col, val in row.items():
        cast = type_map.get(col)
        if cast:
            converted_row[col] = cast(val)
        else:
            converted_row[col] = val
    return converted_row


with open('diamonds.csv', 'r') as f:
    reader = csv.DictReader(f)
    diamonds = [converter(conversion_map, r) for r in reader]


# In[2]:

diamonds[:2]


# Adding things up is easy enough...

# In[3]:

def get_total_carats():
    total_carats = 0
    for row in diamonds:
        total_carats += row['carat']
    return total_carats

get_total_carats()


# In[4]:

# Faster/more compact: generator expression!
sum(row['carat'] for row in diamonds)


# In[5]:

# Which is faster?
get_ipython().run_line_magic('timeit', 'get_total_carats()')


# In[6]:

get_ipython().run_line_magic('timeit', "sum(row['carat'] for row in diamonds)")


# But what if we want to group, then add?

# In[7]:

# defaultdict is awesome. defaultdict is awesome.
from collections import defaultdict


def grouper(grouping_col, seq):
    """People have definitely written a faster version than what I'm about to write."""
    groups = defaultdict(lambda: defaultdict(list))
    for row in seq:
        group = groups[row[grouping_col]]
        for k, v in row.items():
            if k != grouping_col:
                group[k].append(v)
    return groups


# In[8]:

groups = grouper('cut', diamonds)


# In[9]:

# Check that the groups include all the rows from the original data
assert sum(len(g['price']) for g in groups.values()) == len(diamonds)


# In[10]:

set(groups)


# What if I wanted to do something like
# ```sql
# select cut, mean(price)
# from diamonds
# group by cut;
# ```

# In[11]:

summary = {}
for group, values in groups.items():
    summary[group] = sum(values['price']) / len(values['price'])
summary


# Or how about this one:
# ```sql
# select max(price)
# from diamonds
# where carat > 1;
# ```

# In[11]:

def get_max_price():
    max_price = 0
    for row in diamonds:
        if row['carat'] > 1 and row['price'] > max_price:
            max_price = row['price']
    return max_price

get_max_price()


# In[12]:

# More compact yet again: generator expression!
max(row['price'] for row in diamonds if row['carat'] > 1)


# In[13]:

# Which is faster?
get_ipython().run_line_magic('timeit', 'get_max_price()')


# In[14]:

get_ipython().run_line_magic('timeit', "max(row['price'] for row in diamonds if row['carat'] > 1)")


# ### itertools interlude
#
# The itertools module does lots of nice things. You should be aware of it, and use it
# where you would otherwise be writing your own bespoke counting and grouping code.
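# For instance, here's a minimal sketch of the same mean-price-per-cut rollup with
# itertools.groupby. Note that groupby only groups *consecutive* items, so the input
# has to be sorted by the grouping key first:

from itertools import groupby
from operator import itemgetter

itertools_summary = {}
for cut, rows in groupby(sorted(diamonds, key=itemgetter('cut')), key=itemgetter('cut')):
    prices = [r['price'] for r in rows]
    itertools_summary[cut] = sum(prices) / len(prices)
itertools_summary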
# # Toolz!
#
# Let's see what it looks like to repeat some of this analysis using the toolz
# library. PSA: toolz has a pretty huge API, and it's worth reading through the
# docs: http://toolz.readthedocs.org/en/latest/api.html

# In[15]:

import toolz as tz


# Some quick toolz fun things:

# In[16]:

list(tz.concat(([1, 2, 3], (4, 5, 6))))


# In[17]:

list(tz.mapcat(lambda r: [x + "-foo" for x in r],
               [["A", "B"], ("c", "d"), ("bar", "baz")]))


# In[18]:

tz.frequencies([r['color'] for r in diamonds])


# Remember that toolz is lazy: functions will return generator-like things.

# In[19]:

take_2 = tz.take(2, diamonds)
take_2


# In[20]:

unique_clarity = tz.unique(diamonds, key=lambda x: x.get('clarity'))
unique_clarity


# In[21]:

list(take_2)


# In[22]:

# Note that this returns the entire rows, not just the clarity values
list(unique_clarity)


# In[23]:

# What are our clarity counts?
tz.countby(lambda x: x['clarity'], diamonds)


# In[24]:

# What about our max price from above? Reduction!
def comparo(accum, row):
    price = row['price']
    if price > accum:
        return price
    else:
        return accum

tz.reduce(comparo, diamonds, 0)


# In[25]:

# We could have also threaded here
tz.thread_last(diamonds, (tz.map, lambda x: x['price']), max)


# In[26]:

# Which is faster?
get_ipython().run_line_magic('timeit', 'tz.reduce(comparo, diamonds, 0)')


# In[27]:

get_ipython().run_line_magic('timeit', "tz.thread_last(diamonds, (tz.map, lambda x: x['price']), max)")


# Let's look at another SQL query: say we want a count per clarity where price > 1000:
# ```sql
# select count(1)
# from diamonds
# where price > 1000
# group by clarity;
# ```

# In[28]:

# Toolz has currying!
import toolz.curried as tzc

tzc.pipe(diamonds,
         tzc.filter(lambda r: r['price'] > 1000),
         tzc.map(lambda r: (r['clarity'],)),
         tzc.countby(lambda r: r[0]),
         dict)


# In[29]:

# We can go about this another way as well:
def filter_and_count(kv):
    f_and_c = tz.thread_last(kv[1],
                             (tz.filter, lambda r: r['price'] > 1000),
                             tz.count)
    return kv[0], f_and_c

tz.thread_last(diamonds,
               (tz.groupby, 'clarity'),
               (tz.itemmap, filter_and_count))


# In[30]:

# Cleanest/best way: reduceby = groupby + reduce
def increment(accum, row):
    if row['price'] > 1000:
        return accum + 1
    else:
        return accum

tz.reduceby('clarity', increment, diamonds, 0)


# # Pandas!

# In[31]:

import pandas as pd
# We don't need this to use Pandas, FYI
import numpy as np

# CSV reader is fast!
df = pd.read_csv('diamonds.csv', index_col=0)
# Keep this for later, we're going to overwrite df
df_diamonds = df


# In[32]:

df.head()


# In[33]:

df.describe()


# In[34]:

df.groupby('clarity').mean()


# Our previous queries:
# ```sql
# select cut, mean(price)
# from diamonds
# group by cut;
#
# select count(carat)
# from diamonds
# where price > 1000
# group by clarity;
#
# select max(price)
# from diamonds
# where carat > 1;
#
# select cut, price
# from diamonds
# where cut in ('Ideal', 'Premium')
# order by price desc
# limit 10;
# ```
#
# are pretty trivial operations in Pandas:

# In[35]:

df.groupby('cut')['price'].mean()


# In[36]:

df[df['price'] > 1000].groupby('clarity')['carat'].count()


# In[37]:

df[df['carat'] > 1]['price'].max()


# In[38]:

df[df['cut'].isin(['Ideal', 'Premium'])].sort('price', ascending=False)[:10]
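# A quick, hedged sanity check (assuming the `summary` dict from the stdlib
# section is still in scope): the Pandas group means should line up with the
# hand-rolled version from earlier, up to integer-division/floating-point noise.

pandas_means = df.groupby('cut')['price'].mean()
{cut: (stdlib_mean, pandas_means[cut]) for cut, stdlib_mean in summary.items()}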
# In which I do a bunch of cool Pandas things without a real goal

# In[39]:

# I can Transpose things!
df.T


# In[40]:

# SORT ALL THE THINGS!
df.sort(['price', 'carat'], ascending=False).head()


# In[41]:

# Let's use some fake data to show off some stuff:
simple_data_1 = {"int_col": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                 "str_col": ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j"],
                 "float_col": [1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 8.5, 9.5, 10.5],
                 "time_col": ["2015-01-01", "2015-01-02", "2015-01-03", "2015-01-04",
                              "2015-01-05", "2015-01-06", "2015-01-07", "2015-01-08",
                              "2015-01-09", "2015-01-10"]}
my_index = ["a", "b", "c", "a", "b", "c", "a", "b", "c", "a"]

df = pd.DataFrame(simple_data_1, index=my_index)
df


# In[42]:

# DataFrames have indices that can be gotten by label or position
df.loc['a']


# In[43]:

df.iloc[5]


# In[44]:

df[2:4]


# In[45]:

# New column! With missing data!
df['new_col'] = [np.nan, np.nan, 1.0, 2.0, np.nan, 4.0, 5.0, 9.0, np.nan, 10.0]
df


# In[46]:

# Remove missing data!
df.dropna()


# In[47]:

# Fill missing data!
df.fillna("FOO!")


# In[48]:

# Backfill missing data!
df.fillna(method='bfill')


# In[49]:

# Vectorized string methods!
df['str_col'] = ["FOO", "FoO", "Foo", "Fo o", "FOO", "fOO", "fOo", "FoO", "foO", "Foo "]
df


# In[50]:

df['str_col'].str.lower().str.replace(' ', '')


# In[51]:

# Database-style joins!!
left = pd.DataFrame({'key': ['foo', 'bar', 'fizz'], 'lval': [1, 2, 3]})
right = pd.DataFrame({'key': ['foo', 'bar', 'baz', 'qux'], 'rval': [4, 5, 6, 7]})
left.merge(right, how='inner')


# In[52]:

right.merge(left, how='outer')


# In[53]:

left.merge(right, how='left')


# In[54]:

# Let's make some random time series data
dates = pd.date_range('2015-03-25', periods=150, freq='H')
dates


# In[55]:

time_df = pd.DataFrame(np.random.randint(0, 500, 150), index=dates, columns=["Numeric"])
time_df.head()


# In[56]:

# RESAMPLE!
time_df.resample('D', how='mean')


# In[57]:

# Convert weird date formats!
the_worst = ['3/25/2014 23:02:00', '2014-03-25 23:01:00', 'March 25 2015']
pd.to_datetime(the_worst)


# In[58]:

# Shift dates!
time_df.shift(1, freq='D').head()


# In[59]:

# What if I have missing dates?
missing = pd.to_datetime(['2015-03-25', '2015-03-30', '2015-04-05'])
missing_df = pd.DataFrame(np.random.randint(0, 10, 3), index=missing, columns=["Numeric"])
missing_df


# In[60]:

missing_df.asfreq('D')


# In[61]:

missing_df.asfreq('D', method='pad')


# # XRAY!

# In[62]:

import xray


# In[63]:

arr = np.array([[1, 2, 3, 4], [10, 20, 30, 40], [100, 200, 300, 400]])
dim0_coords = ['a', 'b', 'c']
dim1_coords = ['foo', 'bar', 'baz', 'qux']
da = xray.DataArray(arr, [('x', dim0_coords), ('y', dim1_coords)])
da


# In[64]:

# Arrays!
da.values


# In[65]:

da.dims


# In[66]:

da.coords


# In[67]:

# But with some Pandas-like powers!
# Index by slice
da[0:2]


# In[68]:

# Indexing!
da.loc['a']


# In[69]:

da.loc['a':'c']


# In[70]:

da.sel(x=['a', 'c'])


# In[71]:

da.sel(y='foo')


# In[72]:

da.isel(x=0)


# In[73]:

# Do numpy stuff
np.mean(da), np.sum(da)


# In[74]:

np.sin(da)
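# There's also a handy bridge back to Pandas (a hedged aside; this assumes the
# installed xray version exposes DataArray.to_series): the labeled dimensions
# become the index of the resulting pandas object.

da.to_series()            # Series with a MultiIndex over (x, y)
da.to_series().unstack()  # ...and back out to a DataFrame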
# In[75]:

# Broadcast!
da + 100


# In[76]:

# Do all of the above by label
da.mean(dim='x')


# In[77]:

da.sum(dim='y')


# In[78]:

da


# In[79]:

# Group stuff
da.groupby('x').mean()


# In[80]:

da.groupby('y').sum()


# In[81]:

np.random.randn(2, 2, 3)


# In[82]:

# Datasets
x_y = np.array([[1, 1.1, 1.2],
                [2, 2.1, 2.2],
                [3, 3.1, 3.2]])
z_coords = np.array(["10s", "20s"])
data_cube = np.array([[[10, 10, 10], [10, 10, 10], [10, 10, 10]],
                      [[20, 20, 20], [20, 20, 20], [20, 20, 20]]])

ds = xray.Dataset({"cube": (["z", "x", "y"], data_cube)},
                  coords={"z": z_coords,
                          "x": ["col_1", "col_2", "col_3"],
                          "y": ["row_1", "row_2", "row_3"]})
ds


# In[83]:

ds.sum(dim="z")


# In[84]:

ds.mean(dim="x")


# In[85]:

ds.groupby("x").sum()


# # Blaze
#
# For this demo, we're going to focus on a couple of queries we used in the Pandas demo:
# ```sql
# select cut, mean(price)
# from diamonds
# group by cut;
#
# select count(carat)
# from diamonds
# where price > 1000
# group by clarity;
# ```

# In[86]:

import blaze as bz

bz_diamonds = bz.symbol('diamonds', bz.discover(df_diamonds))
type(bz_diamonds)


# In[93]:

mean_price = bz.by(bz_diamonds.cut, price=bz_diamonds.price.mean())
clarity_count = bz.by(bz_diamonds[bz_diamonds.price > 1000].clarity,
                      count=bz_diamonds.carat.count())


# In[88]:

# We haven't actually computed anything yet!
# Let's make Pandas compute it.
bz.compute(mean_price, df_diamonds)


# In[94]:

bz.compute(clarity_count, df_diamonds)


# Ok, so what? You made Pandas do a thing we already did.
#
# ### Oh yeah, what if we want Postgres to compute *the exact same expressions*?
#
# ## WAT

# In[95]:

# Blaze/Odo make it easy to move data between containers.
# Note that we have an empty table already created.
pg_datasource = bz.odo(df_diamonds,
                       "postgresql://postgres:postgres@localhost/pydata::diamonds")


# In[97]:

# Now we're going to use Postgres as our computation engine
result = bz.compute(clarity_count, pg_datasource)
result


# In[99]:

# I don't want a selectable. I want a DataFrame.
# odo again
bz.odo(bz.compute(clarity_count, pg_datasource), pd.DataFrame)


# You can use any SQL database supported by SQLAlchemy as your computation engine.
# Blaze also supports Python lists, Spark DataFrames, MongoDB, NumPy arrays...

# # bcolz!

# In[100]:

import bcolz

dc = bcolz.ctable.fromdataframe(df_diamonds)


# In[101]:

dc.cols


# In[102]:

dsize = dc.cbytes / 2**20.
print("Total size for the ctable: {} MB".format(dsize))
print("Compression ratio for the ctable: {}".format(dc.nbytes / float(dc.cbytes)))


# In[103]:

# You can do DataFrame-like stuff
dc["cut == 'Premium'"]


# In[104]:

dc["(cut == 'Ideal') & (price > 1)"]


# In[106]:

# We can do the same thing with the ctable on disk!
diskdc = dc.copy(rootdir='diamonds')


# In[107]:

diskdc["(cut == 'Ideal') & (price > 1000)"]


# In[108]:

import os

for root, dirs, files in os.walk('diamonds'):
    level = root.replace('diamonds', '').count(os.sep)
    indent = ' ' * 4 * level
    print('{}{}/'.format(indent, os.path.basename(root)))
    subindent = ' ' * 4 * (level + 1)
    for f in files:
        print('{}{}'.format(subindent, f))
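# A hedged sketch of why the on-disk copy is useful (it assumes bcolz.open is
# available and that a string query returns a structured array pandas can ingest):
# reopen the ctable in a later session and pull a filtered result back into pandas.

reopened = bcolz.open('diamonds')
pd.DataFrame(reopened["(cut == 'Ideal') & (price > 1000)"]).head()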
# # Dask!

# In[109]:

import dask.array as da
import dask.dataframe as dd
import dask.bag as db

d_arr = da.from_array(np.random.randn(100000), chunks=100)
d_arr


# In[110]:

d_arr.sum()


# Wait, what happened? Why didn't I get back a number?
#
# Because dask is lazily evaluated, we must `compute` it!

# In[111]:

d_arr.sum().compute()


# In[112]:

d_arr.mean().compute()


# In[113]:

d_arr.max().compute()


# In[114]:

import dask.dataframe as dd

ddf = dd.read_csv('diamonds.csv')
ddf.groupby('cut')['price'].mean().compute()


# In[115]:

ddf['price'].sum().compute()


# In[116]:

ddf[ddf['price'] > 1000].groupby('clarity')['carat'].count().compute()


# We're not quite feature-complete yet...

# In[117]:

ddf[ddf['cut'].isin(['Ideal', 'Premium'])].sort('price', ascending=False)[:10]


# In[118]:

diamonds_b = db.from_sequence(diamonds)


# In[119]:

tz.thread_last(diamonds_b, (tz.map, lambda x: x['price']), max)


# In[120]:

tzc.pipe(diamonds_b,
         tzc.filter(lambda r: r['price'] > 1000),
         tzc.map(lambda r: (r['clarity'],)),
         tzc.countby(lambda r: r[0]),
         dict)
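# dask.bag also exposes a toolz-flavored API of its own; here's a hedged sketch
# (assuming Bag provides filter/pluck/frequencies methods) of the same
# count-by-clarity query done natively, with the laziness resolved by compute():

dict(diamonds_b.filter(lambda r: r['price'] > 1000)
               .pluck('clarity')
               .frequencies()
               .compute())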