We're going to import the data, including our own type converter:
import csv
# Column name -> callable used to coerce the raw CSV string for that column.
# Columns not listed here (cut, color, clarity, ...) stay as strings.
conversion_map = dict(
    carat=float,
    depth=float,
    price=int,
    table=float,
    x=float,
    y=float,
    z=float,
)
def converter(type_map, row):
    """Return a copy of *row* with values coerced through *type_map*.

    Parameters
    ----------
    type_map : dict
        Maps a column name to a callable (e.g. ``int``, ``float``) that
        converts that column's raw string value.
    row : dict
        One record of raw string values, e.g. as yielded by csv.DictReader.

    Columns with no entry in *type_map* pass through unchanged.
    """
    converted_row = {}
    for col, val in row.items():
        # NOTE: the original bound this to a local named `converter`,
        # shadowing the enclosing function — renamed for clarity.
        cast = type_map.get(col)
        converted_row[col] = cast(val) if cast else val
    return converted_row
# Read diamonds.csv once and materialize every row as a typed dict:
# csv.DictReader yields all-string rows, so each one is pushed through
# converter()/conversion_map to coerce the numeric columns.
with open('diamonds.csv', 'r') as f:
reader = csv.DictReader(f)
diamonds = [converter(conversion_map, r) for r in reader]
diamonds[:2]
[{'': '1', 'carat': 0.23, 'clarity': 'SI2', 'color': 'E', 'cut': 'Ideal', 'depth': 61.5, 'price': 326, 'table': 55.0, 'x': 3.95, 'y': 3.98, 'z': 2.43}, {'': '2', 'carat': 0.21, 'clarity': 'SI1', 'color': 'E', 'cut': 'Premium', 'depth': 59.8, 'price': 326, 'table': 61.0, 'x': 3.89, 'y': 3.84, 'z': 2.31}]
Adding things up is easy enough...
def get_total_carats():
    """Sum the 'carat' field across the module-level ``diamonds`` rows."""
    total = 0
    for record in diamonds:
        total = total + record['carat']
    return total
get_total_carats()
43040.86999999912
# Faster/more compact: Generator expression!
sum(row['carat'] for row in diamonds)
43040.86999999912
# Which is faster?
%timeit get_total_carats()
100 loops, best of 3: 5.29 ms per loop
%timeit sum(row['carat'] for row in diamonds)
100 loops, best of 3: 5.7 ms per loop
But what if we want to group, then add?
# defaultdict is awesome. defaultdict is awesome.
from collections import defaultdict
def grouper(grouping_col, seq):
    """Group an iterable of dicts by the value of *grouping_col*.

    Returns a ``defaultdict`` mapping each distinct group value to a
    ``defaultdict(list)`` of column -> list of values, with the grouping
    column itself excluded from every group's columns.
    """
    groups = defaultdict(lambda: defaultdict(list))
    for record in seq:
        bucket = groups[record[grouping_col]]
        for col, val in record.items():
            if col == grouping_col:
                continue
            bucket[col].append(val)
    return groups
groups = grouper('cut', diamonds)
# check that the groups include all the rows from the original data
assert sum(len(g['price']) for g in groups.values()) == len(diamonds)
set(groups)
{'Fair', 'Good', 'Ideal', 'Premium', 'Very Good'}
What if I wanted to do something like
select cut, mean(price)
from diamonds
group by cut;
# Mean price per group — the equivalent of:
#   select cut, mean(price) from diamonds group by cut;
summary = {
    name: sum(vals['price']) / len(vals['price'])
    for name, vals in groups.items()
}
summary
{'Fair': 4358.757763975155, 'Good': 3928.864451691806, 'Ideal': 3457.541970210199, 'Premium': 4584.2577042999055, 'Very Good': 3981.7598907465654}
or how about this one:
select max(price)
from diamonds
where carat > 1;
def get_max_price():
    """Return the highest price among diamonds heavier than one carat.

    Scans the module-level ``diamonds`` list; returns 0 when no row matches.
    """
    best = 0
    for record in diamonds:
        if record['carat'] > 1:
            price = record['price']
            if price > best:
                best = price
    return best
get_max_price()
18823
# More compact yet again: generator expression!
max(row['price'] for row in diamonds if row['carat'] > 1)
18823
# Which is faster?
%timeit get_max_price()
100 loops, best of 3: 6.82 ms per loop
%timeit max(row['price'] for row in diamonds if row['carat'] > 1)
100 loops, best of 3: 7.36 ms per loop
The itertools module does lots of nice things. You should be aware of it, and use it where you would be writing your own bespoke counting thing.
Lets see what it looks like to repeat some of this analysis using the toolz library. PSA: toolz has a pretty huge API, and it's worth reading through the docs: http://toolz.readthedocs.org/en/latest/api.html
import toolz as tz
Some quick toolz fun things:
list(tz.concat(([1, 2, 3], (4, 5, 6))))
[1, 2, 3, 4, 5, 6]
list(tz.mapcat(lambda r: [x + "-foo" for x in r],
[["A", "B"], ("c", "d"), ("bar", "baz")]))
['A-foo', 'B-foo', 'c-foo', 'd-foo', 'bar-foo', 'baz-foo']
tz.frequencies([r['color'] for r in diamonds])
{'D': 6775, 'E': 9797, 'F': 9542, 'G': 11292, 'H': 8304, 'I': 5422, 'J': 2808}
Remember that toolz is lazy — functions will return generator-like things:
take_2 = tz.take(2, diamonds)
take_2
<itertools.islice at 0x11176f260>
unique_clarity = tz.unique(diamonds, key=lambda x: x.get('clarity'))
unique_clarity
<generator object unique at 0x11176b4b0>
list(take_2)
[{'': '1', 'carat': 0.23, 'clarity': 'SI2', 'color': 'E', 'cut': 'Ideal', 'depth': 61.5, 'price': 326, 'table': 55.0, 'x': 3.95, 'y': 3.98, 'z': 2.43}, {'': '2', 'carat': 0.21, 'clarity': 'SI1', 'color': 'E', 'cut': 'Premium', 'depth': 59.8, 'price': 326, 'table': 61.0, 'x': 3.89, 'y': 3.84, 'z': 2.31}]
# Note that this returns the entire object
list(unique_clarity)
[{'': '1', 'carat': 0.23, 'clarity': 'SI2', 'color': 'E', 'cut': 'Ideal', 'depth': 61.5, 'price': 326, 'table': 55.0, 'x': 3.95, 'y': 3.98, 'z': 2.43}, {'': '2', 'carat': 0.21, 'clarity': 'SI1', 'color': 'E', 'cut': 'Premium', 'depth': 59.8, 'price': 326, 'table': 61.0, 'x': 3.89, 'y': 3.84, 'z': 2.31}, {'': '3', 'carat': 0.23, 'clarity': 'VS1', 'color': 'E', 'cut': 'Good', 'depth': 56.9, 'price': 327, 'table': 65.0, 'x': 4.05, 'y': 4.07, 'z': 2.31}, {'': '4', 'carat': 0.29, 'clarity': 'VS2', 'color': 'I', 'cut': 'Premium', 'depth': 62.4, 'price': 334, 'table': 58.0, 'x': 4.2, 'y': 4.23, 'z': 2.63}, {'': '6', 'carat': 0.24, 'clarity': 'VVS2', 'color': 'J', 'cut': 'Very Good', 'depth': 62.8, 'price': 336, 'table': 57.0, 'x': 3.94, 'y': 3.96, 'z': 2.48}, {'': '7', 'carat': 0.24, 'clarity': 'VVS1', 'color': 'I', 'cut': 'Very Good', 'depth': 62.3, 'price': 336, 'table': 57.0, 'x': 3.95, 'y': 3.98, 'z': 2.47}, {'': '16', 'carat': 0.32, 'clarity': 'I1', 'color': 'E', 'cut': 'Premium', 'depth': 60.9, 'price': 345, 'table': 58.0, 'x': 4.38, 'y': 4.42, 'z': 2.68}, {'': '230', 'carat': 0.52, 'clarity': 'IF', 'color': 'F', 'cut': 'Ideal', 'depth': 62.2, 'price': 2783, 'table': 55.0, 'x': 5.14, 'y': 5.18, 'z': 3.21}]
# What are our clarity counts?
tz.countby(lambda x: x['clarity'], diamonds)
{'I1': 741, 'IF': 1790, 'SI1': 13065, 'SI2': 9194, 'VS1': 8171, 'VS2': 12258, 'VVS1': 3655, 'VVS2': 5066}
# What about our max price from above? Reduction!
def comparo(accum, row):
    """Reducer step: running maximum of ``row['price']``."""
    return max(accum, row['price'])
tz.reduce(comparo, diamonds, 0)
18823
# We could have also threaded here
tz.thread_last(diamonds,
(tz.map, lambda x: x['price']),
max)
18823
# Which is faster?
%timeit tz.reduce(comparo, diamonds, 0)
100 loops, best of 3: 8.6 ms per loop
%timeit tz.thread_last(diamonds, (tz.map, lambda x: x['price']), max)
100 loops, best of 3: 11.2 ms per loop
Let's look at another SQL query- say we want clarity and carats where price > 1000:
select count(1)
from diamonds
where price > 1000
group by clarity;
# Toolz has currying!
import toolz.curried as tzc
tzc.pipe(diamonds,
tzc.filter(lambda r: r['price'] > 1000),
tzc.map(lambda r: (r['clarity'],)),
tzc.countby(lambda r: r[0]),
dict)
{'I1': 675, 'IF': 1042, 'SI1': 9978, 'SI2': 8118, 'VS1': 5702, 'VS2': 8647, 'VVS1': 2108, 'VVS2': 3146}
# We can go about this another way as well:
def filter_and_count(kv):
    """Map a (group_key, rows) pair to (group_key, #rows with price > 1000)."""
    key, rows = kv
    expensive = tz.filter(lambda r: r['price'] > 1000, rows)
    return key, tz.count(expensive)
tz.thread_last(diamonds,
(tz.groupby, 'clarity'),
(tz.itemmap, filter_and_count))
{'I1': 675, 'IF': 1042, 'SI1': 9978, 'SI2': 8118, 'VS1': 5702, 'VS2': 8647, 'VVS1': 2108, 'VVS2': 3146}
# Cleanest/best way: reduceby: Groupby + reduce
def increment(accum, row):
    """Reducer step: count rows whose price exceeds 1000."""
    return accum + 1 if row['price'] > 1000 else accum
tz.reduceby('clarity',
increment,
diamonds, 0)
{'I1': 675, 'IF': 1042, 'SI1': 9978, 'SI2': 8118, 'VS1': 5702, 'VS2': 8647, 'VVS1': 2108, 'VVS2': 3146}
import pandas as pd
# We don't need this to use Pandas, FYI
import numpy as np
# CSV reader is fast!
df = pd.read_csv('diamonds.csv', index_col=0)
# Keep this for later, we're going to overwrite df
df_diamonds = df
df.head()
carat | cut | color | clarity | depth | table | price | x | y | z | |
---|---|---|---|---|---|---|---|---|---|---|
1 | 0.23 | Ideal | E | SI2 | 61.5 | 55 | 326 | 3.95 | 3.98 | 2.43 |
2 | 0.21 | Premium | E | SI1 | 59.8 | 61 | 326 | 3.89 | 3.84 | 2.31 |
3 | 0.23 | Good | E | VS1 | 56.9 | 65 | 327 | 4.05 | 4.07 | 2.31 |
4 | 0.29 | Premium | I | VS2 | 62.4 | 58 | 334 | 4.20 | 4.23 | 2.63 |
5 | 0.31 | Good | J | SI2 | 63.3 | 58 | 335 | 4.34 | 4.35 | 2.75 |
df.describe()
carat | depth | table | price | x | y | z | |
---|---|---|---|---|---|---|---|
count | 53940.000000 | 53940.000000 | 53940.000000 | 53940.000000 | 53940.000000 | 53940.000000 | 53940.000000 |
mean | 0.797940 | 61.749405 | 57.457184 | 3932.799722 | 5.731157 | 5.734526 | 3.538734 |
std | 0.474011 | 1.432621 | 2.234491 | 3989.439738 | 1.121761 | 1.142135 | 0.705699 |
min | 0.200000 | 43.000000 | 43.000000 | 326.000000 | 0.000000 | 0.000000 | 0.000000 |
25% | 0.400000 | 61.000000 | 56.000000 | 950.000000 | 4.710000 | 4.720000 | 2.910000 |
50% | 0.700000 | 61.800000 | 57.000000 | 2401.000000 | 5.700000 | 5.710000 | 3.530000 |
75% | 1.040000 | 62.500000 | 59.000000 | 5324.250000 | 6.540000 | 6.540000 | 4.040000 |
max | 5.010000 | 79.000000 | 95.000000 | 18823.000000 | 10.740000 | 58.900000 | 31.800000 |
df.groupby('clarity').mean()
carat | depth | table | price | x | y | z | |
---|---|---|---|---|---|---|---|
clarity | |||||||
I1 | 1.283846 | 62.734278 | 58.303779 | 3924.168691 | 6.761093 | 6.709379 | 4.207908 |
IF | 0.505123 | 61.510615 | 56.507207 | 2864.839106 | 4.968402 | 4.989827 | 3.061659 |
SI1 | 0.850482 | 61.853042 | 57.662541 | 3996.001148 | 5.888383 | 5.888256 | 3.639845 |
SI2 | 1.077648 | 61.772167 | 57.927181 | 5063.028606 | 6.401370 | 6.397826 | 3.948478 |
VS1 | 0.727158 | 61.667458 | 57.315151 | 3839.455391 | 5.572178 | 5.581828 | 3.441007 |
VS2 | 0.763935 | 61.724417 | 57.417401 | 3924.989395 | 5.657709 | 5.658859 | 3.491478 |
VVS1 | 0.503321 | 61.624651 | 56.884460 | 2523.114637 | 4.960364 | 4.975075 | 3.061294 |
VVS2 | 0.596202 | 61.663778 | 57.024990 | 3283.737071 | 5.218454 | 5.232118 | 3.221465 |
Our previous queries:
select cut, mean(price)
from diamonds
group by cut;
select count(carat)
from diamonds
where price > 1000
group by clarity;
select max(price)
from diamonds
where carat > 1;
select cut, price
from diamonds
where cut in ('Ideal', 'Premium')
order by price desc
limit 10;
Are pretty trivial operations in Pandas:
df.groupby('cut')['price'].mean()
cut Fair 4358.757764 Good 3928.864452 Ideal 3457.541970 Premium 4584.257704 Very Good 3981.759891 Name: price, dtype: float64
df[df['price'] > 1000].groupby('clarity')['carat'].count()
clarity I1 675 IF 1042 SI1 9978 SI2 8118 VS1 5702 VS2 8647 VVS1 2108 VVS2 3146 Name: carat, dtype: int64
df[df['carat'] > 1]['price'].max()
18823
df[df['cut'].isin(['Ideal', 'Premium'])].sort('price', ascending=False)[:10]
carat | cut | color | clarity | depth | table | price | x | y | z | |
---|---|---|---|---|---|---|---|---|---|---|
27750 | 2.29 | Premium | I | VS2 | 60.8 | 60 | 18823 | 8.50 | 8.47 | 5.16 |
27748 | 1.51 | Ideal | G | IF | 61.7 | 55 | 18806 | 7.37 | 7.41 | 4.56 |
27747 | 2.07 | Ideal | G | SI2 | 62.5 | 55 | 18804 | 8.20 | 8.13 | 5.11 |
27745 | 2.29 | Premium | I | SI1 | 61.8 | 59 | 18797 | 8.52 | 8.45 | 5.24 |
27744 | 2.00 | Premium | I | VS1 | 60.8 | 59 | 18795 | 8.13 | 8.02 | 4.91 |
27743 | 2.04 | Premium | H | SI1 | 58.1 | 60 | 18795 | 8.37 | 8.28 | 4.84 |
27742 | 2.15 | Ideal | G | SI2 | 62.6 | 54 | 18791 | 8.29 | 8.35 | 5.21 |
27741 | 1.71 | Premium | F | VS2 | 62.3 | 59 | 18791 | 7.57 | 7.53 | 4.70 |
27739 | 2.05 | Ideal | G | SI1 | 61.9 | 57 | 18787 | 8.10 | 8.16 | 5.03 |
27738 | 2.05 | Premium | F | SI2 | 60.2 | 59 | 18784 | 8.28 | 8.33 | 5.00 |
In which I do a bunch of cool Pandas things without a real goal
# I can Transpose things!
df.T
1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | ... | 53931 | 53932 | 53933 | 53934 | 53935 | 53936 | 53937 | 53938 | 53939 | 53940 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
carat | 0.23 | 0.21 | 0.23 | 0.29 | 0.31 | 0.24 | 0.24 | 0.26 | 0.22 | 0.23 | ... | 0.71 | 0.71 | 0.7 | 0.7 | 0.72 | 0.72 | 0.72 | 0.7 | 0.86 | 0.75 |
cut | Ideal | Premium | Good | Premium | Good | Very Good | Very Good | Very Good | Fair | Very Good | ... | Premium | Premium | Very Good | Very Good | Premium | Ideal | Good | Very Good | Premium | Ideal |
color | E | E | E | I | J | J | I | H | E | H | ... | E | F | E | E | D | D | D | D | H | D |
clarity | SI2 | SI1 | VS1 | VS2 | SI2 | VVS2 | VVS1 | SI1 | VS2 | VS1 | ... | SI1 | SI1 | VS2 | VS2 | SI1 | SI1 | SI1 | SI1 | SI2 | SI2 |
depth | 61.5 | 59.8 | 56.9 | 62.4 | 63.3 | 62.8 | 62.3 | 61.9 | 65.1 | 59.4 | ... | 60.5 | 59.8 | 60.5 | 61.2 | 62.7 | 60.8 | 63.1 | 62.8 | 61 | 62.2 |
table | 55 | 61 | 65 | 58 | 58 | 57 | 57 | 55 | 61 | 61 | ... | 55 | 62 | 59 | 59 | 59 | 57 | 55 | 60 | 58 | 55 |
price | 326 | 326 | 327 | 334 | 335 | 336 | 336 | 337 | 337 | 338 | ... | 2756 | 2756 | 2757 | 2757 | 2757 | 2757 | 2757 | 2757 | 2757 | 2757 |
x | 3.95 | 3.89 | 4.05 | 4.2 | 4.34 | 3.94 | 3.95 | 4.07 | 3.87 | 4 | ... | 5.79 | 5.74 | 5.71 | 5.69 | 5.69 | 5.75 | 5.69 | 5.66 | 6.15 | 5.83 |
y | 3.98 | 3.84 | 4.07 | 4.23 | 4.35 | 3.96 | 3.98 | 4.11 | 3.78 | 4.05 | ... | 5.74 | 5.73 | 5.76 | 5.72 | 5.73 | 5.76 | 5.75 | 5.68 | 6.12 | 5.87 |
z | 2.43 | 2.31 | 2.31 | 2.63 | 2.75 | 2.48 | 2.47 | 2.53 | 2.49 | 2.39 | ... | 3.49 | 3.43 | 3.47 | 3.49 | 3.58 | 3.5 | 3.61 | 3.56 | 3.74 | 3.64 |
10 rows × 53940 columns
# SORT ALL THE THINGS!
df.sort(['price', 'carat'], ascending=False).head()
carat | cut | color | clarity | depth | table | price | x | y | z | |
---|---|---|---|---|---|---|---|---|---|---|
27750 | 2.29 | Premium | I | VS2 | 60.8 | 60 | 18823 | 8.50 | 8.47 | 5.16 |
27749 | 2.00 | Very Good | G | SI1 | 63.5 | 56 | 18818 | 7.90 | 7.97 | 5.04 |
27748 | 1.51 | Ideal | G | IF | 61.7 | 55 | 18806 | 7.37 | 7.41 | 4.56 |
27747 | 2.07 | Ideal | G | SI2 | 62.5 | 55 | 18804 | 8.20 | 8.13 | 5.11 |
27746 | 2.00 | Very Good | H | SI1 | 62.8 | 57 | 18803 | 7.95 | 8.00 | 5.01 |
# Lets use some fake data to show off some stuff:
simple_data_1 = {"int_col": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
"str_col": ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j"],
"float_col": [1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 8.5, 9.5, 10.5],
"time_col": ["2015-01-01", "2015-01-02", "2015-01-03", "2015-01-04", "2015-01-05",
"2015-01-06", "2015-01-07", "2015-01-08", "2015-01-09", "2015-01-10"]}
my_index = ["a", "b", "c", "a", "b", "c", "a", "b", "c", "a"]
df = pd.DataFrame(simple_data_1, index=my_index)
df
float_col | int_col | str_col | time_col | |
---|---|---|---|---|
a | 1.5 | 1 | a | 2015-01-01 |
b | 2.5 | 2 | b | 2015-01-02 |
c | 3.5 | 3 | c | 2015-01-03 |
a | 4.5 | 4 | d | 2015-01-04 |
b | 5.5 | 5 | e | 2015-01-05 |
c | 6.5 | 6 | f | 2015-01-06 |
a | 7.5 | 7 | g | 2015-01-07 |
b | 8.5 | 8 | h | 2015-01-08 |
c | 9.5 | 9 | i | 2015-01-09 |
a | 10.5 | 10 | j | 2015-01-10 |
# DataFrames have indices that can be gotten by label or position
df.loc['a']
float_col | int_col | str_col | time_col | |
---|---|---|---|---|
a | 1.5 | 1 | a | 2015-01-01 |
a | 4.5 | 4 | d | 2015-01-04 |
a | 7.5 | 7 | g | 2015-01-07 |
a | 10.5 | 10 | j | 2015-01-10 |
df.iloc[5]
float_col 6.5 int_col 6 str_col f time_col 2015-01-06 Name: c, dtype: object
df[2:4]
float_col | int_col | str_col | time_col | |
---|---|---|---|---|
c | 3.5 | 3 | c | 2015-01-03 |
a | 4.5 | 4 | d | 2015-01-04 |
# New Column! With Missing Data!
df['new_col'] = [np.nan, np.nan, 1.0, 2.0, np.nan, 4.0, 5.0, 9.0, np.nan, 10.0]
df
float_col | int_col | str_col | time_col | new_col | |
---|---|---|---|---|---|
a | 1.5 | 1 | a | 2015-01-01 | NaN |
b | 2.5 | 2 | b | 2015-01-02 | NaN |
c | 3.5 | 3 | c | 2015-01-03 | 1 |
a | 4.5 | 4 | d | 2015-01-04 | 2 |
b | 5.5 | 5 | e | 2015-01-05 | NaN |
c | 6.5 | 6 | f | 2015-01-06 | 4 |
a | 7.5 | 7 | g | 2015-01-07 | 5 |
b | 8.5 | 8 | h | 2015-01-08 | 9 |
c | 9.5 | 9 | i | 2015-01-09 | NaN |
a | 10.5 | 10 | j | 2015-01-10 | 10 |
# Removing missing data!
df.dropna()
float_col | int_col | str_col | time_col | new_col | |
---|---|---|---|---|---|
c | 3.5 | 3 | c | 2015-01-03 | 1 |
a | 4.5 | 4 | d | 2015-01-04 | 2 |
c | 6.5 | 6 | f | 2015-01-06 | 4 |
a | 7.5 | 7 | g | 2015-01-07 | 5 |
b | 8.5 | 8 | h | 2015-01-08 | 9 |
a | 10.5 | 10 | j | 2015-01-10 | 10 |
# Fill missing data!
df.fillna("FOO!")
float_col | int_col | str_col | time_col | new_col | |
---|---|---|---|---|---|
a | 1.5 | 1 | a | 2015-01-01 | FOO! |
b | 2.5 | 2 | b | 2015-01-02 | FOO! |
c | 3.5 | 3 | c | 2015-01-03 | 1 |
a | 4.5 | 4 | d | 2015-01-04 | 2 |
b | 5.5 | 5 | e | 2015-01-05 | FOO! |
c | 6.5 | 6 | f | 2015-01-06 | 4 |
a | 7.5 | 7 | g | 2015-01-07 | 5 |
b | 8.5 | 8 | h | 2015-01-08 | 9 |
c | 9.5 | 9 | i | 2015-01-09 | FOO! |
a | 10.5 | 10 | j | 2015-01-10 | 10 |
# Backfill missing data!
df.fillna(method='bfill')
float_col | int_col | str_col | time_col | new_col | |
---|---|---|---|---|---|
a | 1.5 | 1 | a | 2015-01-01 | 1 |
b | 2.5 | 2 | b | 2015-01-02 | 1 |
c | 3.5 | 3 | c | 2015-01-03 | 1 |
a | 4.5 | 4 | d | 2015-01-04 | 2 |
b | 5.5 | 5 | e | 2015-01-05 | 4 |
c | 6.5 | 6 | f | 2015-01-06 | 4 |
a | 7.5 | 7 | g | 2015-01-07 | 5 |
b | 8.5 | 8 | h | 2015-01-08 | 9 |
c | 9.5 | 9 | i | 2015-01-09 | 10 |
a | 10.5 | 10 | j | 2015-01-10 | 10 |
# Vectorized string methods!
df['str_col'] = ["FOO", "FoO", "Foo", "Fo o", "FOO", "fOO", "fOo", "FoO", "foO", "Foo "]
df
float_col | int_col | str_col | time_col | new_col | |
---|---|---|---|---|---|
a | 1.5 | 1 | FOO | 2015-01-01 | NaN |
b | 2.5 | 2 | FoO | 2015-01-02 | NaN |
c | 3.5 | 3 | Foo | 2015-01-03 | 1 |
a | 4.5 | 4 | Fo o | 2015-01-04 | 2 |
b | 5.5 | 5 | FOO | 2015-01-05 | NaN |
c | 6.5 | 6 | fOO | 2015-01-06 | 4 |
a | 7.5 | 7 | fOo | 2015-01-07 | 5 |
b | 8.5 | 8 | FoO | 2015-01-08 | 9 |
c | 9.5 | 9 | foO | 2015-01-09 | NaN |
a | 10.5 | 10 | Foo | 2015-01-10 | 10 |
df['str_col'].str.lower().str.replace(' ', '')
a foo b foo c foo a foo b foo c foo a foo b foo c foo a foo Name: str_col, dtype: object
# Database style joins!!
left = pd.DataFrame({'key': ['foo', 'bar', 'fizz'], 'lval': [1, 2, 3]})
right = pd.DataFrame({'key': ['foo', 'bar', 'baz', 'qux'],
'rval': [4, 5, 6, 7]})
left.merge(right, how='inner')
key | lval | rval | |
---|---|---|---|
0 | foo | 1 | 4 |
1 | bar | 2 | 5 |
right.merge(left, how='outer')
key | rval | lval | |
---|---|---|---|
0 | foo | 4 | 1 |
1 | bar | 5 | 2 |
2 | baz | 6 | NaN |
3 | qux | 7 | NaN |
4 | fizz | NaN | 3 |
left.merge(right, how='left')
key | lval | rval | |
---|---|---|---|
0 | foo | 1 | 4 |
1 | bar | 2 | 5 |
2 | fizz | 3 | NaN |
# Lets make some random timeseries data
dates = pd.date_range('2015-03-25', periods=150, freq='H')
dates
DatetimeIndex(['2015-03-25 00:00:00', '2015-03-25 01:00:00', '2015-03-25 02:00:00', '2015-03-25 03:00:00', '2015-03-25 04:00:00', '2015-03-25 05:00:00', '2015-03-25 06:00:00', '2015-03-25 07:00:00', '2015-03-25 08:00:00', '2015-03-25 09:00:00', ... '2015-03-30 20:00:00', '2015-03-30 21:00:00', '2015-03-30 22:00:00', '2015-03-30 23:00:00', '2015-03-31 00:00:00', '2015-03-31 01:00:00', '2015-03-31 02:00:00', '2015-03-31 03:00:00', '2015-03-31 04:00:00', '2015-03-31 05:00:00'], dtype='datetime64[ns]', length=150, freq='H', tz=None)
time_df = pd.DataFrame(np.random.randint(0, 500, 150), index=dates, columns=["Numeric"])
time_df.head()
Numeric | |
---|---|
2015-03-25 00:00:00 | 451 |
2015-03-25 01:00:00 | 10 |
2015-03-25 02:00:00 | 18 |
2015-03-25 03:00:00 | 113 |
2015-03-25 04:00:00 | 283 |
# RESAMPLE!
time_df.resample('D', how='mean')
Numeric | |
---|---|
2015-03-25 | 211.166667 |
2015-03-26 | 255.041667 |
2015-03-27 | 241.625000 |
2015-03-28 | 291.791667 |
2015-03-29 | 227.625000 |
2015-03-30 | 212.125000 |
2015-03-31 | 307.500000 |
# Convert weird date formats!
the_worst = ['3/25/2014 23:02:00', '2014-03-25 23:01:00', 'March 25 2015']
pd.to_datetime(the_worst)
DatetimeIndex(['2014-03-25 23:02:00', '2014-03-25 23:01:00', '2015-03-25 00:00:00'], dtype='datetime64[ns]', freq=None, tz=None)
# Shift dates!
time_df.shift(1, freq='D').head()
Numeric | |
---|---|
2015-03-26 00:00:00 | 451 |
2015-03-26 01:00:00 | 10 |
2015-03-26 02:00:00 | 18 |
2015-03-26 03:00:00 | 113 |
2015-03-26 04:00:00 | 283 |
# What if I have missing dates?
missing = pd.to_datetime(['2015-03-25', '2015-03-30', '2015-04-05'])
missing_df = pd.DataFrame(np.random.randint(0, 10, 3), index=missing, columns=["Numeric"])
missing_df
Numeric | |
---|---|
2015-03-25 | 1 |
2015-03-30 | 9 |
2015-04-05 | 4 |
missing_df.asfreq('D')
Numeric | |
---|---|
2015-03-25 | 1 |
2015-03-26 | NaN |
2015-03-27 | NaN |
2015-03-28 | NaN |
2015-03-29 | NaN |
2015-03-30 | 9 |
2015-03-31 | NaN |
2015-04-01 | NaN |
2015-04-02 | NaN |
2015-04-03 | NaN |
2015-04-04 | NaN |
2015-04-05 | 4 |
missing_df.asfreq('D', method='pad')
Numeric | |
---|---|
2015-03-25 | 1 |
2015-03-26 | 1 |
2015-03-27 | 1 |
2015-03-28 | 1 |
2015-03-29 | 1 |
2015-03-30 | 9 |
2015-03-31 | 9 |
2015-04-01 | 9 |
2015-04-02 | 9 |
2015-04-03 | 9 |
2015-04-04 | 9 |
2015-04-05 | 4 |
import xray
arr = np.array([[1, 2, 3, 4],
[10, 20, 30, 40],
[100, 200, 300, 400]])
dim0_coords = ['a', 'b', 'c']
dim1_coords = ['foo', 'bar', 'baz', 'qux']
da = xray.DataArray(arr, [('x', dim0_coords), ('y', dim1_coords)])
da
<xray.DataArray (x: 3, y: 4)> array([[ 1, 2, 3, 4], [ 10, 20, 30, 40], [100, 200, 300, 400]]) Coordinates: * x (x) |S1 'a' 'b' 'c' * y (y) |S3 'foo' 'bar' 'baz' 'qux'
# Arrays!
da.values
array([[ 1, 2, 3, 4], [ 10, 20, 30, 40], [100, 200, 300, 400]])
da.dims
('x', 'y')
da.coords
Coordinates: * x (x) |S1 'a' 'b' 'c' * y (y) |S3 'foo' 'bar' 'baz' 'qux'
# But with some Pandas-like powers!
# Index by slice
da[0:2]
<xray.DataArray (x: 2, y: 4)> array([[ 1, 2, 3, 4], [10, 20, 30, 40]]) Coordinates: * x (x) |S1 'a' 'b' * y (y) |S3 'foo' 'bar' 'baz' 'qux'
# Indexing!
da.loc['a']
<xray.DataArray (y: 4)> array([1, 2, 3, 4]) Coordinates: x |S1 'a' * y (y) |S3 'foo' 'bar' 'baz' 'qux'
da.loc['a':'c']
<xray.DataArray (x: 3, y: 4)> array([[ 1, 2, 3, 4], [ 10, 20, 30, 40], [100, 200, 300, 400]]) Coordinates: * x (x) |S1 'a' 'b' 'c' * y (y) |S3 'foo' 'bar' 'baz' 'qux'
da.sel(x=['a', 'c'])
<xray.DataArray (x: 2, y: 4)> array([[ 1, 2, 3, 4], [100, 200, 300, 400]]) Coordinates: * x (x) |S1 'a' 'c' * y (y) |S3 'foo' 'bar' 'baz' 'qux'
da.sel(y='foo')
<xray.DataArray (x: 3)> array([ 1, 10, 100]) Coordinates: * x (x) |S1 'a' 'b' 'c' y |S3 'foo'
da.isel(x=0)
<xray.DataArray (y: 4)> array([1, 2, 3, 4]) Coordinates: x |S1 'a' * y (y) |S3 'foo' 'bar' 'baz' 'qux'
# Do numpy stuff
np.mean(da), np.sum(da)
(<xray.DataArray ()> array(92.5), <xray.DataArray ()> array(1110))
np.sin(da)
<xray.DataArray (x: 3, y: 4)> array([[ 0.84147098, 0.90929743, 0.14112001, -0.7568025 ], [-0.54402111, 0.91294525, -0.98803162, 0.74511316], [-0.50636564, -0.8732973 , -0.99975584, -0.85091936]]) Coordinates: * y (y) |S3 'foo' 'bar' 'baz' 'qux' * x (x) |S1 'a' 'b' 'c'
# Broadcast!
da + 100
<xray.DataArray (x: 3, y: 4)> array([[101, 102, 103, 104], [110, 120, 130, 140], [200, 300, 400, 500]]) Coordinates: * y (y) |S3 'foo' 'bar' 'baz' 'qux' * x (x) |S1 'a' 'b' 'c'
# Do all of the above by label
da.mean(dim='x')
<xray.DataArray (y: 4)> array([ 37., 74., 111., 148.]) Coordinates: * y (y) |S3 'foo' 'bar' 'baz' 'qux'
da.sum(dim='y')
<xray.DataArray (x: 3)> array([ 10, 100, 1000]) Coordinates: * x (x) |S1 'a' 'b' 'c'
da
<xray.DataArray (x: 3, y: 4)> array([[ 1, 2, 3, 4], [ 10, 20, 30, 40], [100, 200, 300, 400]]) Coordinates: * x (x) |S1 'a' 'b' 'c' * y (y) |S3 'foo' 'bar' 'baz' 'qux'
# Group stuff
da.groupby('x').mean()
<xray.DataArray (x: 3)> array([ 2.5, 25. , 250. ]) Coordinates: * x (x) |S1 'a' 'b' 'c'
da.groupby('y').sum()
<xray.DataArray (y: 4)> array([111, 222, 333, 444]) Coordinates: * y (y) |S3 'foo' 'bar' 'baz' 'qux'
np.random.randn(2, 2, 3)
array([[[ 2.51076277, 1.0696244 , 1.33092369], [ 1.8807506 , -0.2231198 , -0.24000785]], [[-0.67361403, -0.21962662, 0.11517111], [-0.79717326, 0.50143901, 0.08006242]]])
# DataSets
x_y = np.array([[1, 1.1, 1.2], [2, 2.1, 2.2], [3, 3.1, 3.2]])
z_coords = np.array(["10s", "20s"])
data_cube = np.array([[[10, 10, 10],
[10, 10, 10],
[10, 10, 10]],
[[20, 20, 20],
[20, 20, 20],
[20, 20, 20]]])
ds = xray.Dataset({"cube": (["z", "x", "y"], data_cube)},
coords={"z": z_coords,
"x": ["col_1", "col_2", "col_3"],
"y": ["row_1", "row_2", "row_3"]})
ds
<xray.Dataset> Dimensions: (x: 3, y: 3, z: 2) Coordinates: * y (y) |S5 'row_1' 'row_2' 'row_3' * x (x) |S5 'col_1' 'col_2' 'col_3' * z (z) |S3 '10s' '20s' Data variables: cube (z, x, y) int64 10 10 10 10 10 10 10 10 10 20 20 20 20 20 20 20 ...
ds.sum(dim="z")
<xray.Dataset> Dimensions: (x: 3, y: 3) Coordinates: * y (y) |S5 'row_1' 'row_2' 'row_3' * x (x) |S5 'col_1' 'col_2' 'col_3' Data variables: cube (x, y) int64 30 30 30 30 30 30 30 30 30
ds.mean(dim="x")
<xray.Dataset> Dimensions: (y: 3, z: 2) Coordinates: * y (y) |S5 'row_1' 'row_2' 'row_3' * z (z) |S3 '10s' '20s' Data variables: cube (z, y) float64 10.0 10.0 10.0 20.0 20.0 20.0
ds.groupby("x").sum()
<xray.Dataset> Dimensions: (x: 3) Coordinates: * x (x) |S5 'col_1' 'col_2' 'col_3' Data variables: cube (x) int64 90 90 90
For this demo, we're going to focus on a couple queries we used in the Pandas demo:
select cut, mean(price)
from diamonds
group by cut;
select count(carat)
from diamonds
where price > 1000
group by clarity;
import blaze as bz
bz_diamonds = bz.symbol('diamonds', bz.discover(df_diamonds))
type(bz_diamonds)
blaze.expr.expressions.Symbol
mean_price = bz.by(bz_diamonds.cut, price=bz_diamonds.price.mean())
clarity_count = bz.by(bz_diamonds[bz_diamonds.price > 1000].clarity,
count=bz_diamonds.carat.count())
# We haven't actually computed anything yet!
# Let's make Pandas compute it.
bz.compute(mean_price, df_diamonds)
cut | price | |
---|---|---|
0 | Fair | 4358.757764 |
1 | Good | 3928.864452 |
2 | Ideal | 3457.541970 |
3 | Premium | 4584.257704 |
4 | Very Good | 3981.759891 |
bz.compute(clarity_count, df_diamonds)
clarity | count | |
---|---|---|
0 | I1 | 741 |
1 | IF | 1790 |
2 | SI1 | 13065 |
3 | SI2 | 9194 |
4 | VS1 | 8171 |
5 | VS2 | 12258 |
6 | VVS1 | 3655 |
7 | VVS2 | 5066 |
# Blaze/Odo make it easy to move data between containers
# Note that we have an empty table already created
pg_datasource = bz.odo(df_diamonds, "postgresql://postgres:postgres@localhost/pydata::diamonds")
# Now we're going to use Postgres as our computation engine
result = bz.compute(clarity_count, pg_datasource)
result
<sqlalchemy.sql.selectable.Select at 0x11c27e610; Select object>
# I don't want a selectable. I want a DataFrame
# odo again
bz.odo(bz.compute(clarity_count, pg_datasource), pd.DataFrame)
clarity | count | |
---|---|---|
0 | IF | 1790 |
1 | I1 | 741 |
2 | VVS1 | 3655 |
3 | VS2 | 12258 |
4 | VS1 | 8171 |
5 | VVS2 | 5066 |
6 | SI2 | 9194 |
7 | SI1 | 13065 |
You can use any SQL supported by SQLAlchemy as your computation. It also supports Python lists, Spark DataFrames, MongoDB, Numpy arrays...
import bcolz
dc = bcolz.ctable.fromdataframe(df_diamonds)
dc.cols
carat : carray((53940,), float64) nbytes: 421.41 KB; cbytes: 468.10 KB; ratio: 0.90 cparams := cparams(clevel=5, shuffle=True, cname='blosclz') [ 0.23 0.21 0.23 ..., 0.7 0.86 0.75] cut : carray((53940,), |S9) nbytes: 474.08 KB; cbytes: 315.96 KB; ratio: 1.50 cparams := cparams(clevel=5, shuffle=True, cname='blosclz') ['Ideal' 'Premium' 'Good' ..., 'Very Good' 'Premium' 'Ideal'] color : carray((53940,), |S1) nbytes: 52.68 KB; cbytes: 128.00 KB; ratio: 0.41 cparams := cparams(clevel=5, shuffle=True, cname='blosclz') ['E' 'E' 'E' ..., 'D' 'H' 'D'] clarity : carray((53940,), |S4) nbytes: 210.70 KB; cbytes: 256.00 KB; ratio: 0.82 cparams := cparams(clevel=5, shuffle=True, cname='blosclz') ['SI2' 'SI1' 'VS1' ..., 'SI1' 'SI2' 'SI2'] depth : carray((53940,), float64) nbytes: 421.41 KB; cbytes: 407.65 KB; ratio: 1.03 cparams := cparams(clevel=5, shuffle=True, cname='blosclz') [ 61.5 59.8 56.9 ..., 62.8 61. 62.2] table : carray((53940,), float64) nbytes: 421.41 KB; cbytes: 317.15 KB; ratio: 1.33 cparams := cparams(clevel=5, shuffle=True, cname='blosclz') [ 55. 61. 65. ..., 60. 58. 55.] price : carray((53940,), int64) nbytes: 421.41 KB; cbytes: 279.99 KB; ratio: 1.51 cparams := cparams(clevel=5, shuffle=True, cname='blosclz') [ 326 326 327 ..., 2757 2757 2757] x : carray((53940,), float64) nbytes: 421.41 KB; cbytes: 468.10 KB; ratio: 0.90 cparams := cparams(clevel=5, shuffle=True, cname='blosclz') [ 3.95 3.89 4.05 ..., 5.66 6.15 5.83] y : carray((53940,), float64) nbytes: 421.41 KB; cbytes: 467.91 KB; ratio: 0.90 cparams := cparams(clevel=5, shuffle=True, cname='blosclz') [ 3.98 3.84 4.07 ..., 5.68 6.12 5.87] z : carray((53940,), float64) nbytes: 421.41 KB; cbytes: 468.20 KB; ratio: 0.90 cparams := cparams(clevel=5, shuffle=True, cname='blosclz') [ 2.43 2.31 2.31 ..., 3.56 3.74 3.64]
dsize = dc.cbytes / 2**20.
print("Total size for the ctable: {} MB".format(dsize))
print("Compression ratio the ctable: {}".format((dc.nbytes / float(dc.cbytes))))
Total size for the ctable: 3.49322795868 MB Compression ratio the ctable: 1.03081835096
# You can do DataFrame-like stuff
dc["cut == 'Premium'"]
array([(0.21, 'Premium', 'E', 'SI1', 59.8, 61.0, 326, 3.89, 3.84, 2.31), (0.29, 'Premium', 'I', 'VS2', 62.4, 58.0, 334, 4.2, 4.23, 2.63), (0.22, 'Premium', 'F', 'SI1', 60.4, 61.0, 342, 3.88, 3.84, 2.33), ..., (0.71, 'Premium', 'F', 'SI1', 59.8, 62.0, 2756, 5.74, 5.73, 3.43), (0.72, 'Premium', 'D', 'SI1', 62.7, 59.0, 2757, 5.69, 5.73, 3.58), (0.86, 'Premium', 'H', 'SI2', 61.0, 58.0, 2757, 6.15, 6.12, 3.74)], dtype=[('carat', '<f8'), ('cut', 'S9'), ('color', 'S1'), ('clarity', 'S4'), ('depth', '<f8'), ('table', '<f8'), ('price', '<i8'), ('x', '<f8'), ('y', '<f8'), ('z', '<f8')])
dc["(cut == 'Ideal') & (price > 1)"]
array([(0.23, 'Ideal', 'E', 'SI2', 61.5, 55.0, 326, 3.95, 3.98, 2.43), (0.23, 'Ideal', 'J', 'VS1', 62.8, 56.0, 340, 3.93, 3.9, 2.46), (0.31, 'Ideal', 'J', 'SI2', 62.2, 54.0, 344, 4.35, 4.37, 2.71), ..., (0.71, 'Ideal', 'G', 'VS1', 61.4, 56.0, 2756, 5.76, 5.73, 3.53), (0.72, 'Ideal', 'D', 'SI1', 60.8, 57.0, 2757, 5.75, 5.76, 3.5), (0.75, 'Ideal', 'D', 'SI2', 62.2, 55.0, 2757, 5.83, 5.87, 3.64)], dtype=[('carat', '<f8'), ('cut', 'S9'), ('color', 'S1'), ('clarity', 'S4'), ('depth', '<f8'), ('table', '<f8'), ('price', '<i8'), ('x', '<f8'), ('y', '<f8'), ('z', '<f8')])
# We can do the same thing with the ctable on disk!
diskdc = dc.copy(rootdir='diamonds')
diskdc["(cut == 'Ideal') & (price > 1000)"]
array([(0.7, 'Ideal', 'E', 'SI1', 62.5, 57.0, 2757, 5.7, 5.72, 3.57), (0.7, 'Ideal', 'G', 'VS2', 61.6, 56.0, 2757, 5.7, 5.67, 3.5), (0.74, 'Ideal', 'G', 'SI1', 61.6, 55.0, 2760, 5.8, 5.85, 3.59), ..., (0.71, 'Ideal', 'G', 'VS1', 61.4, 56.0, 2756, 5.76, 5.73, 3.53), (0.72, 'Ideal', 'D', 'SI1', 60.8, 57.0, 2757, 5.75, 5.76, 3.5), (0.75, 'Ideal', 'D', 'SI2', 62.2, 55.0, 2757, 5.83, 5.87, 3.64)], dtype=[('carat', '<f8'), ('cut', 'S9'), ('color', 'S1'), ('clarity', 'S4'), ('depth', '<f8'), ('table', '<f8'), ('price', '<i8'), ('x', '<f8'), ('y', '<f8'), ('z', '<f8')])
import os
# Pretty-print the on-disk layout of the bcolz ctable rooted at 'diamonds'.
for root, dirs, files in os.walk('diamonds'):
    # Depth relative to the walk root. The original used
    # root.replace('diamonds', ''), which strips EVERY occurrence of the
    # substring and miscounts depth for nested paths that contain it;
    # os.path.relpath measures depth correctly.
    rel = os.path.relpath(root, 'diamonds')
    level = 0 if rel == '.' else rel.count(os.sep) + 1
    indent = ' ' * 4 * level
    print('{}{}/'.format(indent, os.path.basename(root)))
    subindent = ' ' * 4 * (level + 1)
    for f in files:
        print('{}{}'.format(subindent, f))
diamonds/ __attrs__ __rootdirs__ carat/ __attrs__ data/ __0.blp __1.blp meta/ sizes storage clarity/ __attrs__ data/ __0.blp meta/ sizes storage color/ __attrs__ data/ __0.blp meta/ sizes storage cut/ __attrs__ data/ __0.blp __1.blp meta/ sizes storage depth/ __attrs__ data/ __0.blp __1.blp meta/ sizes storage price/ __attrs__ data/ __0.blp __1.blp meta/ sizes storage table/ __attrs__ data/ __0.blp __1.blp meta/ sizes storage x/ __attrs__ data/ __0.blp __1.blp meta/ sizes storage y/ __attrs__ data/ __0.blp __1.blp meta/ sizes storage z/ __attrs__ data/ __0.blp __1.blp meta/ sizes storage
import dask.array as da
import dask.dataframe as dd
import dask.bag as db
d_arr = da.from_array(np.random.randn(100000), chunks=100)
d_arr
dask.array<x_1, shape=(100000,), chunks=((100, 100, 100, ..., 100, 100)), dtype=float64>
d_arr.sum()
dask.array<x_3, shape=(), chunks=(), dtype=float64>
Wait, what happened? Why didn't I get back an integer?
Because dask is lazily evaluated — we must compute it!
d_arr.sum().compute()
224.03863985322081
d_arr.mean().compute()
0.0022403863985322081
d_arr.max().compute()
4.3235191229285963
import dask.dataframe as dd
ddf = dd.read_csv('diamonds.csv')
ddf.groupby('cut')['price'].mean().compute()
cut Fair 4358.757764 Good 3928.864452 Ideal 3457.541970 Premium 4584.257704 Very Good 3981.759891 Name: price, dtype: float64
ddf['price'].sum().compute()
212135217
ddf[ddf['price'] > 1000].groupby('clarity')['carat'].count().compute()
clarity I1 675 IF 1042 SI1 9978 SI2 8118 VS1 5702 VS2 8647 VVS1 2108 VVS2 3146 Name: carat, dtype: int64
We're not quite feature complete yet...
ddf[ddf['cut'].isin(['Ideal', 'Premium'])].sort('price', ascending=False)[:10]
--------------------------------------------------------------------------- AttributeError Traceback (most recent call last) <ipython-input-117-b0c21ae068eb> in <module>() ----> 1 ddf[ddf['cut'].isin(['Ideal', 'Premium'])].sort('price', ascending=False)[:10] /Users/robstory/src/pydataseattle2015/.env/lib/python2.7/site-packages/dask/dataframe/core.pyc in __getattr__(self, key) 584 return self[key] 585 except NotImplementedError: --> 586 raise e 587 588 def __dir__(self): AttributeError: 'DataFrame' object has no attribute 'sort'
diamonds_b = db.from_sequence(diamonds)
tz.thread_last(diamonds_b, (tz.map, lambda x: x['price']), max)
18823
tzc.pipe(diamonds_b,
tzc.filter(lambda r: r['price'] > 1000),
tzc.map(lambda r: (r['clarity'],)),
tzc.countby(lambda r: r[0]),
dict)
{'I1': 675, 'IF': 1042, 'SI1': 9978, 'SI2': 8118, 'VS1': 5702, 'VS2': 8647, 'VVS1': 2108, 'VVS2': 3146}