We're going to import the data, including our own type converter:
import csv
# Column name -> callable used to coerce the raw CSV string for that column.
# Columns not listed here (cut, color, clarity, ...) stay as strings.
conversion_map = dict(
    carat=float,
    depth=float,
    price=int,
    table=float,
    x=float,
    y=float,
    z=float,
)
def converter(type_map, row):
    """Return a copy of *row* with values coerced through *type_map*.

    Parameters
    ----------
    type_map : dict
        Maps a column name to a callable (e.g. ``int``, ``float``) that
        converts that column's raw string value.
    row : dict
        One record of raw string values, e.g. as yielded by csv.DictReader.

    Columns with no entry in *type_map* pass through unchanged.
    """
    converted_row = {}
    for col, val in row.items():
        # NOTE: the original bound this to a local named `converter`,
        # shadowing the enclosing function — renamed for clarity.
        cast = type_map.get(col)
        converted_row[col] = cast(val) if cast else val
    return converted_row
# Read diamonds.csv once and materialize every row as a typed dict:
# csv.DictReader yields all-string rows, so each one is pushed through
# converter()/conversion_map to coerce the numeric columns.
with open('diamonds.csv', 'r') as f:
reader = csv.DictReader(f)
diamonds = [converter(conversion_map, r) for r in reader]
diamonds[:2]
[{'': '1', 'carat': 0.23, 'clarity': 'SI2', 'color': 'E', 'cut': 'Ideal', 'depth': 61.5, 'price': 326, 'table': 55.0, 'x': 3.95, 'y': 3.98, 'z': 2.43}, {'': '2', 'carat': 0.21, 'clarity': 'SI1', 'color': 'E', 'cut': 'Premium', 'depth': 59.8, 'price': 326, 'table': 61.0, 'x': 3.89, 'y': 3.84, 'z': 2.31}]
Adding things up is easy enough...
def get_total_carats():
    """Sum the 'carat' field across the module-level ``diamonds`` rows."""
    total = 0
    for record in diamonds:
        total = total + record['carat']
    return total
get_total_carats()
43040.86999999912
# Faster/more compact: Generator expression!
sum(row['carat'] for row in diamonds)
43040.86999999912
# Which is faster?
%timeit get_total_carats()
100 loops, best of 3: 5.29 ms per loop
%timeit sum(row['carat'] for row in diamonds)
100 loops, best of 3: 5.7 ms per loop
But what if we want to group, then add?
# defaultdict is awesome. defaultdict is awesome.
from collections import defaultdict
def grouper(grouping_col, seq):
    """Group an iterable of dicts by the value of *grouping_col*.

    Returns a ``defaultdict`` mapping each distinct group value to a
    ``defaultdict(list)`` of column -> list of values, with the grouping
    column itself excluded from every group's columns.
    """
    groups = defaultdict(lambda: defaultdict(list))
    for record in seq:
        bucket = groups[record[grouping_col]]
        for col, val in record.items():
            if col == grouping_col:
                continue
            bucket[col].append(val)
    return groups
groups = grouper('cut', diamonds)
# check that the groups include all the rows from the original data
assert sum(len(g['price']) for g in groups.values()) == len(diamonds)
set(groups)
{'Fair', 'Good', 'Ideal', 'Premium', 'Very Good'}
What if I wanted to do something like
select cut, mean(price)
from diamonds
group by cut;
# Mean price per group — the equivalent of:
#   select cut, mean(price) from diamonds group by cut;
summary = {
    name: sum(vals['price']) / len(vals['price'])
    for name, vals in groups.items()
}
summary
{'Fair': 4358.757763975155, 'Good': 3928.864451691806, 'Ideal': 3457.541970210199, 'Premium': 4584.2577042999055, 'Very Good': 3981.7598907465654}
or how about this one:
select max(price)
from diamonds
where carat > 1;
def get_max_price():
    """Return the highest price among diamonds heavier than one carat.

    Scans the module-level ``diamonds`` list; returns 0 when no row matches.
    """
    best = 0
    for record in diamonds:
        if record['carat'] > 1:
            price = record['price']
            if price > best:
                best = price
    return best
get_max_price()
18823
# More compact yet again: generator expression!
max(row['price'] for row in diamonds if row['carat'] > 1)
18823
# Which is faster?
%timeit get_max_price()
100 loops, best of 3: 6.82 ms per loop
%timeit max(row['price'] for row in diamonds if row['carat'] > 1)
100 loops, best of 3: 7.36 ms per loop
The itertools module does lots of nice things. You should be aware of it, and use it where you would be writing your own bespoke counting thing.
Lets see what it looks like to repeat some of this analysis using the toolz library. PSA: toolz has a pretty huge API, and it's worth reading through the docs: http://toolz.readthedocs.org/en/latest/api.html
import toolz as tz
Some quick toolz fun things:
list(tz.concat(([1, 2, 3], (4, 5, 6))))
[1, 2, 3, 4, 5, 6]
list(tz.mapcat(lambda r: [x + "-foo" for x in r],
[["A", "B"], ("c", "d"), ("bar", "baz")]))
['A-foo', 'B-foo', 'c-foo', 'd-foo', 'bar-foo', 'baz-foo']
tz.frequencies([r['color'] for r in diamonds])
{'D': 6775, 'E': 9797, 'F': 9542, 'G': 11292, 'H': 8304, 'I': 5422, 'J': 2808}
Remember that toolz is lazy — functions will return generator-like things:
take_2 = tz.take(2, diamonds)
take_2
<itertools.islice at 0x11176f260>
unique_clarity = tz.unique(diamonds, key=lambda x: x.get('clarity'))
unique_clarity
<generator object unique at 0x11176b4b0>
list(take_2)
[{'': '1', 'carat': 0.23, 'clarity': 'SI2', 'color': 'E', 'cut': 'Ideal', 'depth': 61.5, 'price': 326, 'table': 55.0, 'x': 3.95, 'y': 3.98, 'z': 2.43}, {'': '2', 'carat': 0.21, 'clarity': 'SI1', 'color': 'E', 'cut': 'Premium', 'depth': 59.8, 'price': 326, 'table': 61.0, 'x': 3.89, 'y': 3.84, 'z': 2.31}]
# Note that this returns the entire object
list(unique_clarity)
[{'': '1', 'carat': 0.23, 'clarity': 'SI2', 'color': 'E', 'cut': 'Ideal', 'depth': 61.5, 'price': 326, 'table': 55.0, 'x': 3.95, 'y': 3.98, 'z': 2.43}, {'': '2', 'carat': 0.21, 'clarity': 'SI1', 'color': 'E', 'cut': 'Premium', 'depth': 59.8, 'price': 326, 'table': 61.0, 'x': 3.89, 'y': 3.84, 'z': 2.31}, {'': '3', 'carat': 0.23, 'clarity': 'VS1', 'color': 'E', 'cut': 'Good', 'depth': 56.9, 'price': 327, 'table': 65.0, 'x': 4.05, 'y': 4.07, 'z': 2.31}, {'': '4', 'carat': 0.29, 'clarity': 'VS2', 'color': 'I', 'cut': 'Premium', 'depth': 62.4, 'price': 334, 'table': 58.0, 'x': 4.2, 'y': 4.23, 'z': 2.63}, {'': '6', 'carat': 0.24, 'clarity': 'VVS2', 'color': 'J', 'cut': 'Very Good', 'depth': 62.8, 'price': 336, 'table': 57.0, 'x': 3.94, 'y': 3.96, 'z': 2.48}, {'': '7', 'carat': 0.24, 'clarity': 'VVS1', 'color': 'I', 'cut': 'Very Good', 'depth': 62.3, 'price': 336, 'table': 57.0, 'x': 3.95, 'y': 3.98, 'z': 2.47}, {'': '16', 'carat': 0.32, 'clarity': 'I1', 'color': 'E', 'cut': 'Premium', 'depth': 60.9, 'price': 345, 'table': 58.0, 'x': 4.38, 'y': 4.42, 'z': 2.68}, {'': '230', 'carat': 0.52, 'clarity': 'IF', 'color': 'F', 'cut': 'Ideal', 'depth': 62.2, 'price': 2783, 'table': 55.0, 'x': 5.14, 'y': 5.18, 'z': 3.21}]
# What are our clarity counts?
tz.countby(lambda x: x['clarity'], diamonds)
{'I1': 741, 'IF': 1790, 'SI1': 13065, 'SI2': 9194, 'VS1': 8171, 'VS2': 12258, 'VVS1': 3655, 'VVS2': 5066}
# What about our max price from above? Reduction!
def comparo(accum, row):
    """Reducer step: running maximum of ``row['price']``."""
    return max(accum, row['price'])
tz.reduce(comparo, diamonds, 0)
18823
# We could have also threaded here
tz.thread_last(diamonds,
(tz.map, lambda x: x['price']),
max)
18823
# Which is faster?
%timeit tz.reduce(comparo, diamonds, 0)
100 loops, best of 3: 8.6 ms per loop
%timeit tz.thread_last(diamonds, (tz.map, lambda x: x['price']), max)
100 loops, best of 3: 11.2 ms per loop
Let's look at another SQL query- say we want clarity and carats where price > 1000:
select count(1)
from diamonds
where price > 1000
group by clarity;
# Toolz has currying!
import toolz.curried as tzc
tzc.pipe(diamonds,
tzc.filter(lambda r: r['price'] > 1000),
tzc.map(lambda r: (r['clarity'],)),
tzc.countby(lambda r: r[0]),
dict)
{'I1': 675, 'IF': 1042, 'SI1': 9978, 'SI2': 8118, 'VS1': 5702, 'VS2': 8647, 'VVS1': 2108, 'VVS2': 3146}
# We can go about this another way as well:
def filter_and_count(kv):
    """Map a (group_key, rows) pair to (group_key, #rows with price > 1000)."""
    key, rows = kv
    expensive = tz.filter(lambda r: r['price'] > 1000, rows)
    return key, tz.count(expensive)
tz.thread_last(diamonds,
(tz.groupby, 'clarity'),
(tz.itemmap, filter_and_count))
{'I1': 675, 'IF': 1042, 'SI1': 9978, 'SI2': 8118, 'VS1': 5702, 'VS2': 8647, 'VVS1': 2108, 'VVS2': 3146}
# Cleanest/best way: reduceby: Groupby + reduce
def increment(accum, row):
    """Reducer step: count rows whose price exceeds 1000."""
    return accum + 1 if row['price'] > 1000 else accum
tz.reduceby('clarity',
increment,
diamonds, 0)
{'I1': 675, 'IF': 1042, 'SI1': 9978, 'SI2': 8118, 'VS1': 5702, 'VS2': 8647, 'VVS1': 2108, 'VVS2': 3146}
import pandas as pd
# We don't need this to use Pandas, FYI
import numpy as np
# CSV reader is fast!
df = pd.read_csv('diamonds.csv', index_col=0)
# Keep this for later, we're going to overwrite df
df_diamonds = df
df.head()
carat | cut | color | clarity | depth | table | price | x | y | z | |
---|---|---|---|---|---|---|---|---|---|---|
1 | 0.23 | Ideal | E | SI2 | 61.5 | 55 | 326 | 3.95 | 3.98 | 2.43 |
2 | 0.21 | Premium | E | SI1 | 59.8 | 61 | 326 | 3.89 | 3.84 | 2.31 |
3 | 0.23 | Good | E | VS1 | 56.9 | 65 | 327 | 4.05 | 4.07 | 2.31 |
4 | 0.29 | Premium | I | VS2 | 62.4 | 58 | 334 | 4.20 | 4.23 | 2.63 |
5 | 0.31 | Good | J | SI2 | 63.3 | 58 | 335 | 4.34 | 4.35 | 2.75 |
df.describe()
carat | depth | table | price | x | y | z | |
---|---|---|---|---|---|---|---|
count | 53940.000000 | 53940.000000 | 53940.000000 | 53940.000000 | 53940.000000 | 53940.000000 | 53940.000000 |
mean | 0.797940 | 61.749405 | 57.457184 | 3932.799722 | 5.731157 | 5.734526 | 3.538734 |
std | 0.474011 | 1.432621 | 2.234491 | 3989.439738 | 1.121761 | 1.142135 | 0.705699 |
min | 0.200000 | 43.000000 | 43.000000 | 326.000000 | 0.000000 | 0.000000 | 0.000000 |
25% | 0.400000 | 61.000000 | 56.000000 | 950.000000 | 4.710000 | 4.720000 | 2.910000 |
50% | 0.700000 | 61.800000 | 57.000000 | 2401.000000 | 5.700000 | 5.710000 | 3.530000 |
75% | 1.040000 | 62.500000 | 59.000000 | 5324.250000 | 6.540000 | 6.540000 | 4.040000 |
max | 5.010000 | 79.000000 | 95.000000 | 18823.000000 | 10.740000 | 58.900000 | 31.800000 |
df.groupby('clarity').mean()
carat | depth | table | price | x | y | z | |
---|---|---|---|---|---|---|---|
clarity | |||||||
I1 | 1.283846 | 62.734278 | 58.303779 | 3924.168691 | 6.761093 | 6.709379 | 4.207908 |
IF | 0.505123 | 61.510615 | 56.507207 | 2864.839106 | 4.968402 | 4.989827 | 3.061659 |
SI1 | 0.850482 | 61.853042 | 57.662541 | 3996.001148 | 5.888383 | 5.888256 | 3.639845 |
SI2 | 1.077648 | 61.772167 | 57.927181 | 5063.028606 | 6.401370 | 6.397826 | 3.948478 |
VS1 | 0.727158 | 61.667458 | 57.315151 | 3839.455391 | 5.572178 | 5.581828 | 3.441007 |
VS2 | 0.763935 | 61.724417 | 57.417401 | 3924.989395 | 5.657709 | 5.658859 | 3.491478 |
VVS1 | 0.503321 | 61.624651 | 56.884460 | 2523.114637 | 4.960364 | 4.975075 | 3.061294 |
VVS2 | 0.596202 | 61.663778 | 57.024990 | 3283.737071 | 5.218454 | 5.232118 | 3.221465 |
Our previous queries:
select cut, mean(price)
from diamonds
group by cut;
select count(carat)
from diamonds
where price > 1000
group by clarity;
select max(price)
from diamonds
where carat > 1;
select cut, price
from diamonds
where cut in ('Ideal', 'Premium')
order by price desc
limit 10;
Are pretty trivial operations in Pandas:
df.groupby('cut')['price'].mean()
cut Fair 4358.757764 Good 3928.864452 Ideal 3457.541970 Premium 4584.257704 Very Good 3981.759891 Name: price, dtype: float64
df[df['price'] > 1000].groupby('clarity')['carat'].count()
clarity I1 675 IF 1042 SI1 9978 SI2 8118 VS1 5702 VS2 8647 VVS1 2108 VVS2 3146 Name: carat, dtype: int64
df[df['carat'] > 1]['price'].max()
18823
df[df['cut'].isin(['Ideal', 'Premium'])].sort('price', ascending=False)[:10]
carat | cut | color | clarity | depth | table | price | x | y | z | |
---|---|---|---|---|---|---|---|---|---|---|
27750 | 2.29 | Premium | I | VS2 | 60.8 | 60 | 18823 | 8.50 | 8.47 | 5.16 |
27748 | 1.51 | Ideal | G | IF | 61.7 | 55 | 18806 | 7.37 | 7.41 | 4.56 |
27747 | 2.07 | Ideal | G | SI2 | 62.5 | 55 | 18804 | 8.20 | 8.13 | 5.11 |
27745 | 2.29 | Premium | I | SI1 | 61.8 | 59 | 18797 | 8.52 | 8.45 | 5.24 |
27744 | 2.00 | Premium | I | VS1 | 60.8 | 59 | 18795 | 8.13 | 8.02 | 4.91 |
27743 | 2.04 | Premium | H | SI1 | 58.1 | 60 | 18795 | 8.37 | 8.28 | 4.84 |
27742 | 2.15 | Ideal | G | SI2 | 62.6 | 54 | 18791 | 8.29 | 8.35 | 5.21 |
27741 | 1.71 | Premium | F | VS2 | 62.3 | 59 | 18791 | 7.57 | 7.53 | 4.70 |
27739 | 2.05 | Ideal | G | SI1 | 61.9 | 57 | 18787 | 8.10 | 8.16 | 5.03 |
27738 | 2.05 | Premium | F | SI2 | 60.2 | 59 | 18784 | 8.28 | 8.33 | 5.00 |
In which I do a bunch of cool Pandas things without a real goal
# I can Transpose things!
df.T
1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | ... | 53931 | 53932 | 53933 | 53934 | 53935 | 53936 | 53937 | 53938 | 53939 | 53940 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
carat | 0.23 | 0.21 | 0.23 | 0.29 | 0.31 | 0.24 | 0.24 | 0.26 | 0.22 | 0.23 | ... | 0.71 | 0.71 | 0.7 | 0.7 | 0.72 | 0.72 | 0.72 | 0.7 | 0.86 | 0.75 |
cut | Ideal | Premium | Good | Premium | Good | Very Good | Very Good | Very Good | Fair | Very Good | ... | Premium | Premium | Very Good | Very Good | Premium | Ideal | Good | Very Good | Premium | Ideal |
color | E | E | E | I | J | J | I | H | E | H | ... | E | F | E | E | D | D | D | D | H | D |
clarity | SI2 | SI1 | VS1 | VS2 | SI2 | VVS2 | VVS1 | SI1 | VS2 | VS1 | ... | SI1 | SI1 | VS2 | VS2 | SI1 | SI1 | SI1 | SI1 | SI2 | SI2 |
depth | 61.5 | 59.8 | 56.9 | 62.4 | 63.3 | 62.8 | 62.3 | 61.9 | 65.1 | 59.4 | ... | 60.5 | 59.8 | 60.5 | 61.2 | 62.7 | 60.8 | 63.1 | 62.8 | 61 | 62.2 |
table | 55 | 61 | 65 | 58 | 58 | 57 | 57 | 55 | 61 | 61 | ... | 55 | 62 | 59 | 59 | 59 | 57 | 55 | 60 | 58 | 55 |
price | 326 | 326 | 327 | 334 | 335 | 336 | 336 | 337 | 337 | 338 | ... | 2756 | 2756 | 2757 | 2757 | 2757 | 2757 | 2757 | 2757 | 2757 | 2757 |
x | 3.95 | 3.89 | 4.05 | 4.2 | 4.34 | 3.94 | 3.95 | 4.07 | 3.87 | 4 | ... | 5.79 | 5.74 | 5.71 | 5.69 | 5.69 | 5.75 | 5.69 | 5.66 | 6.15 | 5.83 |
y | 3.98 | 3.84 | 4.07 | 4.23 | 4.35 | 3.96 | 3.98 | 4.11 | 3.78 | 4.05 | ... | 5.74 | 5.73 | 5.76 | 5.72 | 5.73 | 5.76 | 5.75 | 5.68 | 6.12 | 5.87 |
z | 2.43 | 2.31 | 2.31 | 2.63 | 2.75 | 2.48 | 2.47 | 2.53 | 2.49 | 2.39 | ... | 3.49 | 3.43 | 3.47 | 3.49 | 3.58 | 3.5 | 3.61 | 3.56 | 3.74 | 3.64 |
10 rows × 53940 columns
# SORT ALL THE THINGS!
df.sort(['price', 'carat'], ascending=False).head()
carat | cut | color | clarity | depth | table | price | x | y | z | |
---|---|---|---|---|---|---|---|---|---|---|
27750 | 2.29 | Premium | I | VS2 | 60.8 | 60 | 18823 | 8.50 | 8.47 | 5.16 |
27749 | 2.00 | Very Good | G | SI1 | 63.5 | 56 | 18818 | 7.90 | 7.97 | 5.04 |
27748 | 1.51 | Ideal | G | IF | 61.7 | 55 | 18806 | 7.37 | 7.41 | 4.56 |
27747 | 2.07 | Ideal | G | SI2 | 62.5 | 55 | 18804 | 8.20 | 8.13 | 5.11 |
27746 | 2.00 | Very Good | H | SI1 | 62.8 | 57 | 18803 | 7.95 | 8.00 | 5.01 |
# Lets use some fake data to show off some stuff:
simple_data_1 = {"int_col": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
"str_col": ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j"],
"float_col": [1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 8.5, 9.5, 10.5],
"time_col": ["2015-01-01", "2015-01-02", "2015-01-03", "2015-01-04", "2015-01-05",
"2015-01-06", "2015-01-07", "2015-01-08", "2015-01-09", "2015-01-10"]}
my_index = ["a", "b", "c", "a", "b", "c", "a", "b", "c", "a"]
df = pd.DataFrame(simple_data_1, index=my_index)
df
float_col | int_col | str_col | time_col | |
---|---|---|---|---|
a | 1.5 | 1 | a | 2015-01-01 |
b | 2.5 | 2 | b | 2015-01-02 |
c | 3.5 | 3 | c | 2015-01-03 |
a | 4.5 | 4 | d | 2015-01-04 |
b | 5.5 | 5 | e | 2015-01-05 |
c | 6.5 | 6 | f | 2015-01-06 |
a | 7.5 | 7 | g | 2015-01-07 |
b | 8.5 | 8 | h | 2015-01-08 |
c | 9.5 | 9 | i | 2015-01-09 |
a | 10.5 | 10 | j | 2015-01-10 |
# DataFrames have indices that can be gotten by label or position
df.loc['a']
float_col | int_col | str_col | time_col | |
---|---|---|---|---|
a | 1.5 | 1 | a | 2015-01-01 |
a | 4.5 | 4 | d | 2015-01-04 |
a | 7.5 | 7 | g | 2015-01-07 |
a | 10.5 | 10 | j | 2015-01-10 |
df.iloc[5]
float_col 6.5 int_col 6 str_col f time_col 2015-01-06 Name: c, dtype: object
df[2:4]
float_col | int_col | str_col | time_col | |
---|---|---|---|---|
c | 3.5 | 3 | c | 2015-01-03 |
a | 4.5 | 4 | d | 2015-01-04 |
# New Column! With Missing Data!
df['new_col'] = [np.nan, np.nan, 1.0, 2.0, np.nan, 4.0, 5.0, 9.0, np.nan, 10.0]
df
float_col | int_col | str_col | time_col | new_col | |
---|---|---|---|---|---|
a | 1.5 | 1 | a | 2015-01-01 | NaN |
b | 2.5 | 2 | b | 2015-01-02 | NaN |
c | 3.5 | 3 | c | 2015-01-03 | 1 |
a | 4.5 | 4 | d | 2015-01-04 | 2 |
b | 5.5 | 5 | e | 2015-01-05 | NaN |
c | 6.5 | 6 | f | 2015-01-06 | 4 |
a | 7.5 | 7 | g | 2015-01-07 | 5 |
b | 8.5 | 8 | h | 2015-01-08 | 9 |
c | 9.5 | 9 | i | 2015-01-09 | NaN |
a | 10.5 | 10 | j | 2015-01-10 | 10 |
# Removing missing data!
df.dropna()
float_col | int_col | str_col | time_col | new_col | |
---|---|---|---|---|---|
c | 3.5 | 3 | c | 2015-01-03 | 1 |
a | 4.5 | 4 | d | 2015-01-04 | 2 |
c | 6.5 | 6 | f | 2015-01-06 | 4 |
a | 7.5 | 7 | g | 2015-01-07 | 5 |
b | 8.5 | 8 | h | 2015-01-08 | 9 |
a | 10.5 | 10 | j | 2015-01-10 | 10 |
# Fill missing data!
df.fillna("FOO!")
float_col | int_col | str_col | time_col | new_col | |
---|---|---|---|---|---|
a | 1.5 | 1 | a | 2015-01-01 | FOO! |
b | 2.5 | 2 | b | 2015-01-02 | FOO! |
c | 3.5 | 3 | c | 2015-01-03 | 1 |
a | 4.5 | 4 | d | 2015-01-04 | 2 |
b | 5.5 | 5 | e | 2015-01-05 | FOO! |
c | 6.5 | 6 | f | 2015-01-06 | 4 |
a | 7.5 | 7 | g | 2015-01-07 | 5 |
b | 8.5 | 8 | h | 2015-01-08 | 9 |
c | 9.5 | 9 | i | 2015-01-09 | FOO! |
a | 10.5 | 10 | j | 2015-01-10 | 10 |
# Backfill missing data!
df.fillna(method='bfill')
float_col | int_col | str_col | time_col | new_col | |
---|---|---|---|---|---|
a | 1.5 | 1 | a | 2015-01-01 | 1 |
b | 2.5 | 2 | b | 2015-01-02 | 1 |
c | 3.5 | 3 | c | 2015-01-03 | 1 |
a | 4.5 | 4 | d | 2015-01-04 | 2 |
b | 5.5 | 5 | e | 2015-01-05 | 4 |
c | 6.5 | 6 | f | 2015-01-06 | 4 |
a | 7.5 | 7 | g | 2015-01-07 | 5 |
b | 8.5 | 8 | h | 2015-01-08 | 9 |
c | 9.5 | 9 | i | 2015-01-09 | 10 |
a | 10.5 | 10 | j | 2015-01-10 | 10 |
# Vectorized string methods!
df['str_col'] = ["FOO", "FoO", "Foo", "Fo o", "FOO", "fOO", "fOo", "FoO", "foO", "Foo "]
df
float_col | int_col | str_col | time_col | new_col | |
---|---|---|---|---|---|
a | 1.5 | 1 | FOO | 2015-01-01 | NaN |
b | 2.5 | 2 | FoO | 2015-01-02 | NaN |
c | 3.5 | 3 | Foo | 2015-01-03 | 1 |
a | 4.5 | 4 | Fo o | 2015-01-04 | 2 |
b | 5.5 | 5 | FOO | 2015-01-05 | NaN |
c | 6.5 | 6 | fOO | 2015-01-06 | 4 |
a | 7.5 | 7 | fOo | 2015-01-07 | 5 |
b | 8.5 | 8 | FoO | 2015-01-08 | 9 |
c | 9.5 | 9 | foO | 2015-01-09 | NaN |
a | 10.5 | 10 | Foo | 2015-01-10 | 10 |
df['str_col'].str.lower().str.replace(' ', '')
a foo b foo c foo a foo b foo c foo a foo b foo c foo a foo Name: str_col, dtype: object
# Database style joins!!
left = pd.DataFrame({'key': ['foo', 'bar', 'fizz'], 'lval': [1, 2, 3]})
right = pd.DataFrame({'key': ['foo', 'bar', 'baz', 'qux'],
'rval': [4, 5, 6, 7]})
left.merge(right, how='inner')
key | lval | rval | |
---|---|---|---|
0 | foo | 1 | 4 |
1 | bar | 2 | 5 |
right.merge(left, how='outer')
key | rval | lval | |
---|---|---|---|
0 | foo | 4 | 1 |
1 | bar | 5 | 2 |
2 | baz | 6 | NaN |
3 | qux | 7 | NaN |
4 | fizz | NaN | 3 |
left.merge(right, how='left')
key | lval | rval | |
---|---|---|---|
0 | foo | 1 | 4 |
1 | bar | 2 | 5 |
2 | fizz | 3 | NaN |
# Lets make some random timeseries data
dates = pd.date_range('2015-03-25', periods=150, freq='H')
dates
DatetimeIndex(['2015-03-25 00:00:00', '2015-03-25 01:00:00', '2015-03-25 02:00:00', '2015-03-25 03:00:00', '2015-03-25 04:00:00', '2015-03-25 05:00:00', '2015-03-25 06:00:00', '2015-03-25 07:00:00', '2015-03-25 08:00:00', '2015-03-25 09:00:00', ... '2015-03-30 20:00:00', '2015-03-30 21:00:00', '2015-03-30 22:00:00', '2015-03-30 23:00:00', '2015-03-31 00:00:00', '2015-03-31 01:00:00', '2015-03-31 02:00:00', '2015-03-31 03:00:00', '2015-03-31 04:00:00', '2015-03-31 05:00:00'], dtype='datetime64[ns]', length=150, freq='H', tz=None)
time_df = pd.DataFrame(np.random.randint(0, 500, 150), index=dates, columns=["Numeric"])
time_df.head()
Numeric | |
---|---|
2015-03-25 00:00:00 | 451 |
2015-03-25 01:00:00 | 10 |
2015-03-25 02:00:00 | 18 |
2015-03-25 03:00:00 | 113 |
2015-03-25 04:00:00 | 283 |
# RESAMPLE!
time_df.resample('D', how='mean')
Numeric | |
---|---|
2015-03-25 | 211.166667 |
2015-03-26 | 255.041667 |
2015-03-27 | 241.625000 |
2015-03-28 | 291.791667 |
2015-03-29 | 227.625000 |
2015-03-30 | 212.125000 |
2015-03-31 | 307.500000 |
# Convert weird date formats!
the_worst = ['3/25/2014 23:02:00', '2014-03-25 23:01:00', 'March 25 2015']
pd.to_datetime(the_worst)
DatetimeIndex(['2014-03-25 23:02:00', '2014-03-25 23:01:00', '2015-03-25 00:00:00'], dtype='datetime64[ns]', freq=None, tz=None)
# Shift dates!
time_df.shift(1, freq='D').head()
Numeric | |
---|---|
2015-03-26 00:00:00 | 451 |
2015-03-26 01:00:00 | 10 |
2015-03-26 02:00:00 | 18 |
2015-03-26 03:00:00 | 113 |
2015-03-26 04:00:00 | 283 |
# What if I have missing dates?
missing = pd.to_datetime(['2015-03-25', '2015-03-30', '2015-04-05'])
missing_df = pd.DataFrame(np.random.randint(0, 10, 3), index=missing, columns=["Numeric"])
missing_df
Numeric | |
---|---|
2015-03-25 | 1 |
2015-03-30 | 9 |
2015-04-05 | 4 |
missing_df.asfreq('D')
Numeric | |
---|---|
2015-03-25 | 1 |
2015-03-26 | NaN |
2015-03-27 | NaN |
2015-03-28 | NaN |
2015-03-29 | NaN |
2015-03-30 | 9 |
2015-03-31 | NaN |
2015-04-01 | NaN |
2015-04-02 | NaN |
2015-04-03 | NaN |
2015-04-04 | NaN |
2015-04-05 | 4 |
missing_df.asfreq('D', method='pad')
Numeric | |
---|---|
2015-03-25 | 1 |
2015-03-26 | 1 |
2015-03-27 | 1 |
2015-03-28 | 1 |
2015-03-29 | 1 |
2015-03-30 | 9 |
2015-03-31 | 9 |
2015-04-01 | 9 |
2015-04-02 | 9 |
2015-04-03 | 9 |
2015-04-04 | 9 |
2015-04-05 | 4 |
import xray
arr = np.array([[1, 2, 3, 4],
[10, 20, 30, 40],
[100, 200, 300, 400]])
dim0_coords = ['a', 'b', 'c']
dim1_coords = ['foo', 'bar', 'baz', 'qux']
da = xray.DataArray(arr, [('x', dim0_coords), ('y', dim1_coords)])
da
<xray.DataArray (x: 3, y: 4)> array([[ 1, 2, 3, 4], [ 10, 20, 30, 40], [100, 200, 300, 400]]) Coordinates: * x (x) |S1 'a' 'b' 'c' * y (y) |S3 'foo' 'bar' 'baz' 'qux'
# Arrays!
da.values
array([[ 1, 2, 3, 4], [ 10, 20, 30, 40], [100, 200, 300, 400]])
da.dims
('x', 'y')
da.coords
Coordinates: * x (x) |S1 'a' 'b' 'c' * y (y) |S3 'foo' 'bar' 'baz' 'qux'
# But with some Pandas-like powers!
# Index by slice
da[0:2]
<xray.DataArray (x: 2, y: 4)> array([[ 1, 2, 3, 4], [10, 20, 30, 40]]) Coordinates: * x (x) |S1 'a' 'b' * y (y) |S3 'foo' 'bar' 'baz' 'qux'
# Indexing!
da.loc['a']
<xray.DataArray (y: 4)> array([1, 2, 3, 4]) Coordinates: x |S1 'a' * y (y) |S3 'foo' 'bar' 'baz' 'qux'
da.loc['a':'c']
<xray.DataArray (x: 3, y: 4)> array([[ 1, 2, 3, 4], [ 10, 20, 30, 40], [100, 200, 300, 400]]) Coordinates: * x (x) |S1 'a' 'b' 'c' * y (y) |S3 'foo' 'bar' 'baz' 'qux'
da.sel(x=['a', 'c'])
<xray.DataArray (x: 2, y: 4)> array([[ 1, 2, 3, 4], [100, 200, 300, 400]]) Coordinates: * x (x) |S1 'a' 'c' * y (y) |S3 'foo' 'bar' 'baz' 'qux'
da.sel(y='foo')
<xray.DataArray (x: 3)> array([ 1, 10, 100]) Coordinates: * x (x) |S1 'a' 'b' 'c' y |S3 'foo'
da.isel(x=0)
<xray.DataArray (y: 4)> array([1, 2, 3, 4]) Coordinates: x |S1 'a' * y (y) |S3 'foo' 'bar' 'baz' 'qux'
# Do numpy stuff
np.mean(da), np.sum(da)
(<xray.DataArray ()> array(92.5), <xray.DataArray ()> array(1110))
np.sin(da)
<xray.DataArray (x: 3, y: 4)> array([[ 0.84147098, 0.90929743, 0.14112001, -0.7568025 ], [-0.54402111, 0.91294525, -0.98803162, 0.74511316], [-0.50636564, -0.8732973 , -0.99975584, -0.85091936]]) Coordinates: * y (y) |S3 'foo' 'bar' 'baz' 'qux' * x (x) |S1 'a' 'b' 'c'
# Broadcast!
da + 100
<xray.DataArray (x: 3, y: 4)> array([[101, 102, 103, 104], [110, 120, 130, 140], [200, 300, 400, 500]]) Coordinates: * y (y) |S3 'foo' 'bar' 'baz' 'qux' * x (x) |S1 'a' 'b' 'c'
# Do all of the above by label
da.mean(dim='x')
<xray.DataArray (y: 4)> array([ 37., 74., 111., 148.]) Coordinates: * y (y) |S3 'foo' 'bar' 'baz' 'qux'
da.sum(dim='y')
<xray.DataArray (x: 3)> array([ 10, 100, 1000]) Coordinates: * x (x) |S1 'a' 'b' 'c'
da
<xray.DataArray (x: 3, y: 4)> array([[ 1, 2, 3, 4], [ 10, 20, 30, 40], [100, 200, 300, 400]]) Coordinates: * x (x) |S1 'a' 'b' 'c' * y (y) |S3 'foo' 'bar' 'baz' 'qux'
# Group stuff
da.groupby('x').mean()
<xray.DataArray (x: 3)> array([ 2.5, 25. , 250. ]) Coordinates: * x (x) |S1 'a' 'b' 'c'
da.groupby('y').sum()
<xray.DataArray (y: 4)> array([111, 222, 333, 444]) Coordinates: * y (y) |S3 'foo' 'bar' 'baz' 'qux'
np.random.randn(2, 2, 3)
array([[[ 2.51076277, 1.0696244 , 1.33092369], [ 1.8807506 , -0.2231198 , -0.24000785]], [[-0.67361403, -0.21962662, 0.11517111], [-0.79717326, 0.50143901, 0.08006242]]])
# DataSets
x_y = np.array([[1, 1.1, 1.2], [2, 2.1, 2.2], [3, 3.1, 3.2]])
z_coords = np.array(["10s", "20s"])
data_cube = np.array([[[10, 10, 10],
[10, 10, 10],
[10, 10, 10]],
[[20, 20, 20],
[20, 20, 20],
[20, 20, 20]]])
ds = xray.Dataset({"cube": (["z", "x", "y"], data_cube)},
coords={"z": z_coords,
"x": ["col_1", "col_2", "col_3"],
"y": ["row_1", "row_2", "row_3"]})
ds
<xray.Dataset> Dimensions: (x: 3, y: 3, z: 2) Coordinates: * y (y) |S5 'row_1' 'row_2' 'row_3' * x (x) |S5 'col_1' 'col_2' 'col_3' * z (z) |S3 '10s' '20s' Data variables: cube (z, x, y) int64 10 10 10 10 10 10 10 10 10 20 20 20 20 20 20 20 ...
ds.sum(dim="z")
<xray.Dataset> Dimensions: (x: 3, y: 3) Coordinates: * y (y) |S5 'row_1' 'row_2' 'row_3' * x (x) |S5 'col_1' 'col_2' 'col_3' Data variables: cube (x, y) int64 30 30 30 30 30 30 30 30 30
ds.mean(dim="x")
<xray.Dataset> Dimensions: (y: 3, z: 2) Coordinates: * y (y) |S5 'row_1' 'row_2' 'row_3' * z (z) |S3 '10s' '20s' Data variables: cube (z, y) float64 10.0 10.0 10.0 20.0 20.0 20.0
ds.groupby("x").sum()
<xray.Dataset> Dimensions: (x: 3) Coordinates: * x (x) |S5 'col_1' 'col_2' 'col_3' Data variables: cube (x) int64 90 90 90
For this demo, we're going to focus on a couple queries we used in the Pandas demo:
select cut, mean(price)
from diamonds
group by cut;
select count(carat)
from diamonds
where price > 1000
group by clarity;
import blaze as bz
bz_diamonds = bz.symbol('diamonds', bz.discover(df_diamonds))
type(bz_diamonds)
blaze.expr.expressions.Symbol
mean_price = bz.by(bz_diamonds.cut, price=bz_diamonds.price.mean())
clarity_count = bz.by(bz_diamonds[bz_diamonds.price > 1000].clarity,
count=bz_diamonds.carat.count())
# We haven't actually computed anything yet!
# Let's make Pandas compute it.
bz.compute(mean_price, df_diamonds)
cut | price | |
---|---|---|
0 | Fair | 4358.757764 |
1 | Good | 3928.864452 |
2 | Ideal | 3457.541970 |
3 | Premium | 4584.257704 |
4 | Very Good | 3981.759891 |
bz.compute(clarity_count, df_diamonds)
clarity | count | |
---|---|---|
0 | I1 | 741 |
1 | IF | 1790 |
2 | SI1 | 13065 |
3 | SI2 | 9194 |
4 | VS1 | 8171 |
5 | VS2 | 12258 |
6 | VVS1 | 3655 |
7 | VVS2 | 5066 |
# Blaze/Odo make it easy to move data between containers
# Note that we have an empty table already created
pg_datasource = bz.odo(df_diamonds, "postgresql://postgres:postgres@localhost/pydata::diamonds")
# Now we're going to use Postgres as our computation engine
result = bz.compute(clarity_count, pg_datasource)
result
<sqlalchemy.sql.selectable.Select at 0x11c27e610; Select object>
# I don't want a selectable. I want a DataFrame
# odo again
bz.odo(bz.compute(clarity_count, pg_datasource), pd.DataFrame)
clarity | count | |
---|---|---|
0 | IF | 1790 |
1 | I1 | 741 |
2 | VVS1 | 3655 |
3 | VS2 | 12258 |
4 | VS1 | 8171 |
5 | VVS2 | 5066 |
6 | SI2 | 9194 |
7 | SI1 | 13065 |
You can use any SQL supported by SQLAlchemy as your computation. It also supports Python lists, Spark DataFrames, MongoDB, Numpy arrays...
import bcolz
dc = bcolz.ctable.fromdataframe(df_diamonds)
dc.cols
carat : carray((53940,), float64) nbytes: 421.41 KB; cbytes: 468.10 KB; ratio: 0.90 cparams := cparams(clevel=5, shuffle=True, cname='blosclz') [ 0.23 0.21 0.23 ..., 0.7 0.86 0.75] cut : carray((53940,), |S9) nbytes: 474.08 KB; cbytes: 315.96 KB; ratio: 1.50 cparams := cparams(clevel=5, shuffle=True, cname='blosclz') ['Ideal' 'Premium' 'Good' ..., 'Very Good' 'Premium' 'Ideal'] color : carray((53940,), |S1) nbytes: 52.68 KB; cbytes: 128.00 KB; ratio: 0.41 cparams := cparams(clevel=5, shuffle=True, cname='blosclz') ['E' 'E' 'E' ..., 'D' 'H' 'D'] clarity : carray((53940,), |S4) nbytes: 210.70 KB; cbytes: 256.00 KB; ratio: 0.82 cparams := cparams(clevel=5, shuffle=True, cname='blosclz') ['SI2' 'SI1' 'VS1' ..., 'SI1' 'SI2' 'SI2'] depth : carray((53940,), float64) nbytes: 421.41 KB; cbytes: 407.65 KB; ratio: 1.03 cparams := cparams(clevel=5, shuffle=True, cname='blosclz') [ 61.5 59.8 56.9 ..., 62.8 61. 62.2] table : carray((53940,), float64) nbytes: 421.41 KB; cbytes: 317.15 KB; ratio: 1.33 cparams := cparams(clevel=5, shuffle=True, cname='blosclz') [ 55. 61. 65. ..., 60. 58. 55.] price : carray((53940,), int64) nbytes: 421.41 KB; cbytes: 279.99 KB; ratio: 1.51 cparams := cparams(clevel=5, shuffle=True, cname='blosclz') [ 326 326 327 ..., 2757 2757 2757] x : carray((53940,), float64) nbytes: 421.41 KB; cbytes: 468.10 KB; ratio: 0.90 cparams := cparams(clevel=5, shuffle=True, cname='blosclz') [ 3.95 3.89 4.05 ..., 5.66 6.15 5.83] y : carray((53940,), float64) nbytes: 421.41 KB; cbytes: 467.91 KB; ratio: 0.90 cparams := cparams(clevel=5, shuffle=True, cname='blosclz') [ 3.98 3.84 4.07 ..., 5.68 6.12 5.87] z : carray((53940,), float64) nbytes: 421.41 KB; cbytes: 468.20 KB; ratio: 0.90 cparams := cparams(clevel=5, shuffle=True, cname='blosclz') [ 2.43 2.31 2.31 ..., 3.56 3.74 3.64]
dsize = dc.cbytes / 2**20.
print("Total size for the ctable: {} MB".format(dsize))
print("Compression ratio the ctable: {}".format((dc.nbytes / float(dc.cbytes))))
Total size for the ctable: 3.49322795868 MB Compression ratio the ctable: 1.03081835096
# You can do DataFrame-like stuff
dc["cut == 'Premium'"]
array([(0.21, 'Premium', 'E', 'SI1', 59.8, 61.0, 326, 3.89, 3.84, 2.31), (0.29, 'Premium', 'I', 'VS2', 62.4, 58.0, 334, 4.2, 4.23, 2.63), (0.22, 'Premium', 'F', 'SI1', 60.4, 61.0, 342, 3.88, 3.84, 2.33), ..., (0.71, 'Premium', 'F', 'SI1', 59.8, 62.0, 2756, 5.74, 5.73, 3.43), (0.72, 'Premium', 'D', 'SI1', 62.7, 59.0, 2757, 5.69, 5.73, 3.58), (0.86, 'Premium', 'H', 'SI2', 61.0, 58.0, 2757, 6.15, 6.12, 3.74)], dtype=[('carat', '<f8'), ('cut', 'S9'), ('color', 'S1'), ('clarity', 'S4'), ('depth', '<f8'), ('table', '<f8'), ('price', '<i8'), ('x', '<f8'), ('y', '<f8'), ('z', '<f8')])
dc["(cut == 'Ideal') & (price > 1)"]
array([(0.23, 'Ideal', 'E', 'SI2', 61.5, 55.0, 326, 3.95, 3.98, 2.43), (0.23, 'Ideal', 'J', 'VS1', 62.8, 56.0, 340, 3.93, 3.9, 2.46), (0.31, 'Ideal', 'J', 'SI2', 62.2, 54.0, 344, 4.35, 4.37, 2.71), ..., (0.71, 'Ideal', 'G', 'VS1', 61.4, 56.0, 2756, 5.76, 5.73, 3.53), (0.72, 'Ideal', 'D', 'SI1', 60.8, 57.0, 2757, 5.75, 5.76, 3.5), (0.75, 'Ideal', 'D', 'SI2', 62.2, 55.0, 2757, 5.83, 5.87, 3.64)], dtype=[('carat', '<f8'), ('cut', 'S9'), ('color', 'S1'), ('clarity', 'S4'), ('depth', '<f8'), ('table', '<f8'), ('price', '<i8'), ('x', '<f8'), ('y', '<f8'), ('z', '<f8')])
# We can do the same thing with the ctable on disk!
diskdc = dc.copy(rootdir='diamonds')
diskdc["(cut == 'Ideal') & (price > 1000)"]
array([(0.7, 'Ideal', 'E', 'SI1', 62.5, 57.0, 2757, 5.7, 5.72, 3.57), (0.7, 'Ideal', 'G', 'VS2', 61.6, 56.0, 2757, 5.7, 5.67, 3.5), (0.74, 'Ideal', 'G', 'SI1', 61.6, 55.0, 2760, 5.8, 5.85, 3.59), ..., (0.71, 'Ideal', 'G', 'VS1', 61.4, 56.0, 2756, 5.76, 5.73, 3.53), (0.72, 'Ideal', 'D', 'SI1', 60.8, 57.0, 2757, 5.75, 5.76, 3.5), (0.75, 'Ideal', 'D', 'SI2', 62.2, 55.0, 2757, 5.83, 5.87, 3.64)], dtype=[('carat', '<f8'), ('cut', 'S9'), ('color', 'S1'), ('clarity', 'S4'), ('depth', '<f8'), ('table', '<f8'), ('price', '<i8'), ('x', '<f8'), ('y', '<f8'), ('z', '<f8')])
import os
# Pretty-print the on-disk layout of the bcolz ctable rooted at 'diamonds'.
for root, dirs, files in os.walk('diamonds'):
    # Depth relative to the walk root. The original used
    # root.replace('diamonds', ''), which strips EVERY occurrence of the
    # substring and miscounts depth for nested paths that contain it;
    # os.path.relpath measures depth correctly.
    rel = os.path.relpath(root, 'diamonds')
    level = 0 if rel == '.' else rel.count(os.sep) + 1
    indent = ' ' * 4 * level
    print('{}{}/'.format(indent, os.path.basename(root)))
    subindent = ' ' * 4 * (level + 1)
    for f in files:
        print('{}{}'.format(subindent, f))
diamonds/ __attrs__ __rootdirs__ carat/ __attrs__ data/ __0.blp __1.blp meta/ sizes storage clarity/ __attrs__ data/ __0.blp meta/ sizes storage color/ __attrs__ data/ __0.blp meta/ sizes storage cut/ __attrs__ data/ __0.blp __1.blp meta/ sizes storage depth/ __attrs__ data/ __0.blp __1.blp meta/ sizes storage price/ __attrs__ data/ __0.blp __1.blp meta/ sizes storage table/ __attrs__ data/ __0.blp __1.blp meta/ sizes storage x/ __attrs__ data/ __0.blp __1.blp meta/ sizes storage y/ __attrs__ data/ __0.blp __1.blp meta/ sizes storage z/ __attrs__ data/ __0.blp __1.blp meta/ sizes storage
import dask.array as da
import dask.dataframe as dd
import dask.bag as db
d_arr = da.from_array(np.random.randn(100000), chunks=100)
d_arr
dask.array<x_1, shape=(100000,), chunks=((100, 100, 100, ..., 100, 100)), dtype=float64>
d_arr.sum()
dask.array<x_3, shape=(), chunks=(), dtype=float64>
Wait, what happened? Why didn't I get back an integer?
Because dask is lazily evaluated — we must compute it!
d_arr.sum().compute()
224.03863985322081
d_arr.mean().compute()
0.0022403863985322081
d_arr.max().compute()
4.3235191229285963
import dask.dataframe as dd
ddf = dd.read_csv('diamonds.csv')
ddf.groupby('cut')['price'].mean().compute()
cut Fair 4358.757764 Good 3928.864452 Ideal 3457.541970 Premium 4584.257704 Very Good 3981.759891 Name: price, dtype: float64
ddf['price'].sum().compute()
212135217
ddf[ddf['price'] > 1000].groupby('clarity')['carat'].count().compute()
clarity I1 675 IF 1042 SI1 9978 SI2 8118 VS1 5702 VS2 8647 VVS1 2108 VVS2 3146 Name: carat, dtype: int64
We're not quite feature complete yet...
ddf[ddf['cut'].isin(['Ideal', 'Premium'])].sort('price', ascending=False)[:10]
--------------------------------------------------------------------------- AttributeError Traceback (most recent call last) <ipython-input-117-b0c21ae068eb> in <module>() ----> 1 ddf[ddf['cut'].isin(['Ideal', 'Premium'])].sort('price', ascending=False)[:10] /Users/robstory/src/pydataseattle2015/.env/lib/python2.7/site-packages/dask/dataframe/core.pyc in __getattr__(self, key) 584 return self[key] 585 except NotImplementedError: --> 586 raise e 587 588 def __dir__(self): AttributeError: 'DataFrame' object has no attribute 'sort'
diamonds_b = db.from_sequence(diamonds)
tz.thread_last(diamonds_b, (tz.map, lambda x: x['price']), max)
18823
tzc.pipe(diamonds_b,
tzc.filter(lambda r: r['price'] > 1000),
tzc.map(lambda r: (r['clarity'],)),
tzc.countby(lambda r: r[0]),
dict)
{'I1': 675, 'IF': 1042, 'SI1': 9978, 'SI2': 8118, 'VS1': 5702, 'VS2': 8647, 'VVS1': 2108, 'VVS2': 3146}