#!/usr/bin/env python
# coding: utf-8

# # Updating a dictionary
#
# Often, we want to create a copy of a dictionary and update it. For example, we have a
# default set of parameters and want to update them without disturbing the original
# parameter list.
#
# There are two ways of doing this:
#
# 1. Make a copy of the dictionary and update it with the new dictionary. **This is about twice as fast.**
# 2. Concatenate the `.items()` lists of both dictionaries and make a new `dict` out of the
#    result (in Python 2, `.items()` returns a list, so the two lists can simply be added).
#
# Here is the benchmark (same keys = updating existing keys, new keys = adding keys):
#
#                                       same keys   new keys
#     1. make a copy and update            1.58µs     1.66µs
#     2. add .items() and dictify          2.81µs     3.36µs

# In[1]:

base = {x: x for x in range(20)}
same = {x: x for x in range(20)}
incr = {x: x for x in range(20, 40)}

get_ipython().run_line_magic('timeit', 'y=dict(base); y.update(same)')
get_ipython().run_line_magic('timeit', 'y=dict(base); y.update(incr)')

base = base.items()
same = same.items()
incr = incr.items()

get_ipython().run_line_magic('timeit', 'dict(base + same)')
get_ipython().run_line_magic('timeit', 'dict(base + incr)')

# # Large string creation
#
# Array joins are faster than successive appending:
#
#                     1,000     10,000    100,000   1,000,000
#     appending       226µs     2.87ms    15.8ms    484ms
#     array joins     116µs     1.11ms    11.2ms    146ms

# In[4]:

def string_append(s, count):
    result = ''
    for x in range(count):
        result += s
    return result

def array_join(s, count):
    result = []
    for x in range(count):
        result.append(s)
    return ''.join(result)

for count in [1000, 10000, 100000, 1000000]:
    print count, 'concatenations'
    get_ipython().run_line_magic('timeit', "string_append('abc', count)")
    get_ipython().run_line_magic('timeit', "array_join('abc', count)")

# # Date parsing
#
# Date parsing in Python is quite slow, especially for large arrays.
# Here's a benchmark of various approaches:
#
#     to_datetime:  7740 ms
#     dateutil:     6970 ms
#     strptime:     1660 ms
#     manual:        253 ms
#     lookup:          9 ms
#
# Manually slicing the date string is significantly faster.
# If there aren't many distinct dates, lookups are *much* faster.

# In[2]:

import time
import datetime
import dateutil.parser
import pandas as pd

s = pd.Series(['01-31-2012'] * 100000)

# In[5]:

# Use Pandas' built-in to_datetime
get_ipython().run_line_magic('timeit', 'pd.to_datetime(s)')

# In[6]:

# Use dateutil.parser
get_ipython().run_line_magic('timeit', 's.apply(dateutil.parser.parse)')

# In[7]:

# Parse using datetime.strptime
get_ipython().run_line_magic('timeit', "s.apply(lambda v: datetime.datetime.strptime(v, '%m-%d-%Y'))")

# In[8]:

# Manually parse the date
get_ipython().run_line_magic('timeit', 's.apply(lambda v: datetime.datetime(int(v[6:10]), int(v[0:2]), int(v[3:5])))')

# In[9]:

def lookup(s):
    """
    This is an extremely fast approach to datetime parsing.
    For large data, the same dates are often repeated. Rather than
    re-parse these, we store all unique dates, parse them, and
    use a lookup to convert all dates.
    """
    return s.map({date: pd.to_datetime(date) for date in s.unique()})

get_ipython().run_line_magic('timeit', 'lookup(s)')

# # Mean vs Median
#
# Mean is *much (~100 times) faster* to calculate than median.

# In[9]:

import time
import numpy

data = numpy.random.rand(50000000)

# In[10]:

get_ipython().run_line_magic('timeit', 'numpy.mean(data)')

# In[11]:

get_ipython().run_line_magic('timeit', 'numpy.median(data)')
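# A hedged aside, not part of the original benchmark: most of the median's extra cost is
# the selection step. `numpy.median` relies on a partial sort (`numpy.partition`) rather
# than a full sort, which is still far more work than the single pass a mean needs.
# A rough sketch of that, reusing the `data` array above:

# In[ ]:

# Sketch only: compare the single-pass mean against the partition a median needs,
# and against a full sort, on the same 50M-element array.
get_ipython().run_line_magic('timeit', 'numpy.mean(data)')
get_ipython().run_line_magic('timeit', 'numpy.partition(data, data.size // 2)')
get_ipython().run_line_magic('timeit', 'numpy.sort(data)')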
# # Reading data
#
# HDF5 is the fastest way of reading tabular data.
#
#     csv.DictReader:        2.78 s
#     pickle:                2.41 s
#     json:                  2.39 s
#     json-array:            799 ms
#     csv.reader:            478 ms
#     pd.read_csv:           355 ms
#     pd.read_pickle:        319 ms
#     pd.read_hdf (table):   169 ms
#     pd.read_hdf (stored):  123 ms

# In[1]:

# First, create a set of data files
words = 'ad adipisicing aliqua aliquip amet anim aute cillum commodo consectetur consequat culpa cupidatat deserunt do dolor dolore duis ea eiusmod elit enim esse est et eu ex excepteur exercitation fugiat id in incididunt ipsum irure labore laboris laborum lorem magna minim mollit nisi non nostrud nulla occaecat officia pariatur proident qui quis reprehenderit sed sint sit sunt tempor ullamco ut velit veniam voluptate'.split()

# Create the data in memory
data = []
for row in range(0, 1000000):
    data.append({
        'A': words[row % len(words)],
        'B': chr(64 + (row % 62)),
        'C': row,
        'D': row + 1,
        'E': row + 2,
        'F': row + 3,
    })

# Save CSV
import csv
keys = sorted(data[0].keys())
out = csv.DictWriter(open('sample.data.csv', 'w'), fieldnames=keys, lineterminator='\n')
out.writerow(dict(zip(keys, keys)))
out.writerows(data)

# Save JSON
import json
json.dump(data, open('sample.data.json', 'w'), separators=(',', ':'))

# Save JSON-array
json.dump([data[0].keys()] + [row.values() for row in data],
          open('sample.data-array.json', 'w'), separators=(',', ':'))

# Save pickle
import cPickle as pickle
pickle.dump(data, open('sample.data.pickle', 'wb'), pickle.HIGHEST_PROTOCOL)

# Save pandas pickle
import pandas as pd
df = pd.DataFrame(data, columns=data[0].keys())
df.to_pickle('sample.data.pandas')

# Save HDF5
df.to_hdf('sample.data.h5', 'stored')
df.to_hdf('sample.data.h5', 'table', table=True)

# In[2]:

import time
import csv
import json
import cPickle as pickle
import pandas as pd

get_ipython().run_line_magic('timeit', "list(csv.DictReader(open('sample.data.csv')))")
get_ipython().run_line_magic('timeit', "pickle.load(open('sample.data.pickle', 'rb'))")
get_ipython().run_line_magic('timeit', "json.load(open('sample.data.json'))")
get_ipython().run_line_magic('timeit', "json.load(open('sample.data-array.json'))")
get_ipython().run_line_magic('timeit', "list(csv.reader(open('sample.data.csv')))")
get_ipython().run_line_magic('timeit', "pd.read_csv('sample.data.csv')")
get_ipython().run_line_magic('timeit', "pd.read_pickle('sample.data.pandas')")
get_ipython().run_line_magic('timeit', "pd.read_hdf('sample.data.h5', 'table')")
get_ipython().run_line_magic('timeit', "pd.read_hdf('sample.data.h5', 'stored')")
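# A hedged footnote to the benchmark above: the 'stored' key uses the fixed HDF5 format
# (fastest to load in full), while the 'table' key uses the table format, which can be
# queried on read. A sketch of that, assuming the sample.data.h5 file written above:

# In[ ]:

# Sketch only: table-format stores accept a where= clause (on the index, and on any
# data_columns declared at write time), so a slice can be read without loading all rows.
subset = pd.read_hdf('sample.data.h5', 'table', where='index < 1000')
print len(subset)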
# # Templates vs lxml vs cElementTree
#
# This is the time taken to generate a bar chart, in µs. The output can be either XML
# (an etree) or text.
#
#                       xml     text
#     template           68       35
#     lxml               73       87
#     cElementTree       23      247
#
# For string output, tornado templates are extremely fast. Even for etree output, a
# template plus `etree.fromstring` is faster than building the tree with lxml directly.
# However, if you only want etree output and not a string, cElementTree is faster.
#
# To me, the template approach with `lxml.fromstring` appears optimal.

# In[1]:

from IPython.display import HTML
from tornado import template

using_template = template.Template('''<svg width="100" height="50">
{% for i, x in enumerate(series) %}
    <rect x="{{ 10 * i }}" width="10" y="{{ 50 - 10 * x }}" height="{{ 10 * x }}" fill="#88f" stroke="#fff"/>
{% end %}
</svg>''', autoescape=None).generate

HTML(using_template(series=[1, 2, 3, 4, 3, 2, 1]))

# In[2]:

from lxml import etree

def using_lxml(series):
    root = etree.Element('svg', width="100", height="50")
    for i, x in enumerate(series):
        rect = etree.SubElement(root, 'rect',
                                x='%d' % (10 * i),
                                width='10',
                                y='%d' % (50 - 10 * x),
                                height='%d' % (10 * x),
                                fill='#88f',
                                stroke='#fff')
    return root

HTML(etree.tostring(using_lxml(series=[1, 2, 3, 4, 3, 2, 1])))

# In[3]:

import xml.etree.cElementTree as cElementTree

def using_cElementTree(series):
    root = cElementTree.Element('svg', width="100", height="50")
    for i, x in enumerate(series):
        rect = cElementTree.SubElement(root, 'rect',
                                       x='%d' % (10 * i),
                                       width='10',
                                       y='%d' % (50 - 10 * x),
                                       height='%d' % (10 * x),
                                       fill='#88f',
                                       stroke='#fff')
    return root

HTML(cElementTree.tostring(using_cElementTree(series=[1, 2, 3, 4, 3, 2, 1])))

# In[6]:

# Create etree output
get_ipython().run_line_magic('timeit', 'etree.fromstring(using_template(series=[1,2,3,4,3,2,1]))')
get_ipython().run_line_magic('timeit', 'using_lxml(series=[1,2,3,4,3,2,1])')
get_ipython().run_line_magic('timeit', 'using_cElementTree(series=[1,2,3,4,3,2,1])')

# Create string output
get_ipython().run_line_magic('timeit', 'using_template(series=[1,2,3,4,3,2,1])')
get_ipython().run_line_magic('timeit', 'etree.tostring(using_lxml(series=[1,2,3,4,3,2,1]))')
get_ipython().run_line_magic('timeit', 'cElementTree.tostring(using_cElementTree(series=[1,2,3,4,3,2,1]))')

# # Range search
#
# Here, we're trying to find where a value fits in a list of numbers. For example, in the
# list [1, 3, 7, 9], the number 4 would be just after the 2nd element (3).
#
# The summary is: use `numpy.searchsorted()` -- it's blazingly fast.
#
#     37,000 µs   For loop
#      5,790 µs   Numpy filtering
#      2,270 µs   Numpy filtering on sorted values
#      1,850 µs   Numpy index search on sorted values
#          1 µs   numpy.searchsorted()
#
# Having read [this post](http://blog.clifreeder.com/blog/2013/04/21/ruby-is-too-slow-for-programming-competitions/)
# on Ruby being slow, I thought I'd check the same with Python. I got it running fairly
# fast, but there was one piece that was taking a fair bit of time: *counting numbers in
# a range*. Here's the slow version:

# In[1]:

values = range(1000000)

def count(values, a, b):
    count = 0
    for x in values:
        if a <= x <= b:
            count += 1
    return count

get_ipython().run_line_magic('timeit', 'count(values, 250000, 750000)')

# Of course, running a loop over numbers in Python is never a good idea. Let's move this
# to NumPy.

# In[2]:

import numpy

values = numpy.random.rand(1000000)

get_ipython().run_line_magic('timeit', '((.25 <= values) & (values <= .75)).sum()')

# That's not bad, but it could get a lot better. First, let's sort the values and try again.

# In[3]:

values.sort()

get_ipython().run_line_magic('timeit', '((.25 <= values) & (values <= .75)).sum()')

# Just like that, it's faster. But we can do much better. Given that the array is already
# sorted, what if we just found the indices?

# In[4]:

get_ipython().run_line_magic('timeit', '(values <= .75).argmin() - (.25 <= values).argmax()')

# A bit faster. It's wasteful of memory, though, since it creates two new boolean arrays
# just to find the positions of two numbers. What if we searched for them instead?

# In[5]:

get_ipython().run_line_magic('timeit', 'numpy.searchsorted(values, .75) - numpy.searchsorted(values, .25)')

# That's 1.45 *micro*seconds. It's *25 thousand* times faster than the original code, and
# *four thousand* times faster than the original NumPy code.
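# A hedged footnote, not from the original post: by default `numpy.searchsorted` uses
# side='left', so the difference above counts the half-open range .25 <= x < .75. If the
# closed range matters, the `side` argument makes it explicit:

# In[ ]:

# Sketch only: side='left' gives the first index with values[i] >= a, side='right' gives
# the first index with values[i] > b, so their difference counts a <= x <= b exactly.
count_closed = (numpy.searchsorted(values, .75, side='right') -
                numpy.searchsorted(values, .25, side='left'))
print count_closed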
# If there's one thing I keep re-learning, it's that there's always a faster way of doing
# it, and if you really want to, you'll probably find it.

# # Next power of 10
#
# The next power of 10 for 4 is 10^1. For 40, it's 10^2. For 400, it's 10^3. For 0.04,
# it's 10^-1. And so on.
#
# Most methods of calculating it are fast enough.

# In[1]:

import numpy as np
import pandas as pd

data = pd.Series(10 ** (6 * np.random.rand(10000) - 3))

# In[2]:

def iterative(v):
    i = 1
    if v > 1:
        n = 0
        while i < v:
            i, n = i * 10, n + 1
    else:
        n = 1
        while i > v:
            i, n = i / 10., n - 1
    return n

get_ipython().run_line_magic('timeit', 'data.apply(iterative)')

# In[3]:

get_ipython().run_line_magic('timeit', 'numpy.ceil(numpy.log10(data))')

# In[4]:

get_ipython().run_line_magic('timeit', 'data.apply(lambda v: numpy.ceil(numpy.log10(v)))')

# # Hierarchical subtotals
#
# Given a DataFrame like this:
#
#     A  B  C  val
#     X  X  X    0
#     X  X  Y    1
#     X  Y  X    2
#     X  Y  Y    3
#     Y  X  X    4
#     Y  X  Y    5
#     Y  Y  X    6
#     Y  Y  Y    7
#
# ... create a DataFrame like this, with subtotals:
#
#     A   B   C   val  level
#     na  na  na   28      0
#     X   na  na    6      1
#     X   X   na    1      2
#     X   X   X     0      3
#     X   X   Y     1      3
#     X   Y   na    5      2
#     X   Y   X     2      3
#     X   Y   Y     3      3
#     Y   na  na   22      1
#     Y   X   na    9      2
#     Y   X   X     4      3
#     Y   X   Y     5      3
#     Y   Y   na   13      2
#     Y   Y   X     6      3
#     Y   Y   Y     7      3

# In[57]:

data = pd.DataFrame({'A': list('XXXXYYYY'),
                     'B': list('XXYYXXYY'),
                     'C': list('XYXYXYXY'),
                     'val': range(8)})

# In[85]:

groups = ['A', 'B', 'C']

def subtotal(data, groups, agg):
    frames = []
    for level in range(1, 1 + len(groups)):
        frame = data.groupby(groups[:level], sort=False, as_index=False).agg(agg)
        frame['level'] = level
        frames.append(frame)
    df = pd.concat(frames)
    for group in groups:
        df[group].fillna('', inplace=True)
    return df.sort(groups).set_index(groups)

print subtotal(data, groups=groups, agg={'val': 'sum'})

# This is faster than the existing `layout.hierarchy`.

# In[91]:

import layout

odi = pd.read_csv('d:/site/gramener.com/viz/autolyse/data/odi-batting.csv', dtype={'Runs': float})
groups = ['Weekday', 'Country', 'Player']
agg = {'Runs': 'sum'}

get_ipython().run_line_magic('timeit', 'subtotal(odi, groups, agg)')
get_ipython().run_line_magic('timeit', "list(layout.hierarchy(odi, groups, agg=agg, size=lambda df: df['Runs'].sum()))")

# In[ ]:

# stack(series, groupby)

# # Numba
#
# I'm trying to see how fast numba is. `autojit(fn)` makes `fn` faster. `numpy.sum` and
# `@autojit` take about the same time. Python loops are much slower.
#
# Looks like @autojit is a decent replacement for `numpy.vectorize`.

# In[2]:

from numba import autojit

def slow_sum(arr):
    M, N = arr.shape
    result = 0.0
    for i in range(M):
        for j in range(N):
            result += arr[i, j]
    return result

fast_sum = autojit(slow_sum)

# In[3]:

get_ipython().run_line_magic('timeit', 'numpy.sum(numpy.random.rand(1000,1000))')
get_ipython().run_line_magic('timeit', 'fast_sum(numpy.random.rand(1000,1000))')
get_ipython().run_line_magic('timeit', 'slow_sum(numpy.random.rand(1000,1000))')
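# A hedged note beyond the original: recent Numba releases dropped `autojit`; the
# equivalent today is the `numba.njit` decorator (or `@jit(nopython=True)`). A minimal
# sketch of the same sum with that API, assuming a current Numba install:

# In[ ]:

# Sketch only: decorator form of the JIT-compiled sum above.
from numba import njit

@njit
def njit_sum(arr):
    M, N = arr.shape
    result = 0.0
    for i in range(M):
        for j in range(N):
            result += arr[i, j]
    return result

get_ipython().run_line_magic('timeit', 'njit_sum(numpy.random.rand(1000, 1000))')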
# # Shifting a Numpy array
#
# If you have an array like `[3,4,5,6,7]` and you want to shift it one place to the right,
# dropping the last element and filling the left with NaN to get `[nan,3,4,5,6]`, what's
# the fastest way?
#
# Answer:
#
#     result = numpy.roll(array, 1)
#     result[0] = numpy.nan

# In[10]:

data = numpy.random.rand(1001)

get_ipython().run_line_magic('timeit', 'result = numpy.insert(data, 0, numpy.nan)[:-1]')
get_ipython().run_line_magic('timeit', 'result = numpy.roll(data, 1); result[:1] = numpy.nan')

# # Voronoi diagrams
#
# [Voronoi diagrams](http://en.wikipedia.org/wiki/Voronoi_diagram) take a set of points
# and create polygons enclosing the space closer to each point than to any other. This is
# the dual of [Delaunay triangulation](http://en.wikipedia.org/wiki/Delaunay_triangulation),
# which matplotlib and scipy provide by default, and from which the Voronoi segments can
# be built directly with NumPy.
#
# Here's the speed of generation via various methods:
#
#     Method                                          Time (10K)   Time (100K)
#     matplotlib.delaunay.triangulate.Triangulation     16.5ms       222ms
#     voronoi() using the above                         41.9ms       793ms
#     scipy.spatial.Delaunay                            51.4ms       797ms

# In[3]:

import numpy

# In[4]:

scale = .9
small = (1 - scale) / 2 + scale * numpy.random.rand(2, 10000)
large = (1 - scale) / 2 + scale * numpy.random.rand(2, 100000)

# In[5]:

import matplotlib.delaunay.triangulate as tri

get_ipython().run_line_magic('timeit', 'tri.Triangulation(*small)')
get_ipython().run_line_magic('timeit', 'tri.Triangulation(*large)')

# In[6]:

def voronoi(X, Y):
    ''' Return line segments describing the voronoi diagram of X and Y '''
    # Get the points X, Y into a matrix P.
    P = numpy.zeros((X.size + 4, 2))
    P[:X.size, 0], P[:Y.size, 1] = X, Y

    # Add four points at (pseudo) "infinity"
    m = max(numpy.abs(X).max(), numpy.abs(Y).max()) * 1e5
    P[X.size:, 0] = -m, -m, +m, +m
    P[Y.size:, 1] = -m, +m, -m, +m

    # Delaunay triangulate, and get the circumcenters
    D = tri.Triangulation(P[:, 0], P[:, 1])
    C = D.circumcenters

    # D.triangle_neighbors gives 3 neighbours per triangle.
    # Each (triangle, neighbour) pair represents a line.
    n = len(C)
    tgt = D.triangle_neighbors
    src = (numpy.zeros_like(tgt).T + numpy.arange(n)).T

    # Remove all -1s
    positives = tgt >= 0
    n = positives.sum()
    src = src[positives].reshape(n)
    tgt = tgt[positives].reshape(n)

    # TODO: Clip to get polygons
    # --------------------------

    # Get areas
    # ---------
    # http://www.mathopenref.com/coordpolygonarea.html
    csrc = C[src]
    ctgt = C[tgt]
    areas = csrc[:, 0] * ctgt[:, 1] - csrc[:, 1] * ctgt[:, 0]
    # print areas
    # Now add up the areas by the indices given in src

    # Get the circumcenters
    return numpy.concatenate((C[tgt].reshape(n, 1, 2), C[src].reshape(n, 1, 2)), axis=1)

get_ipython().run_line_magic('timeit', 'voronoi(small[0,:], small[1,:])')
get_ipython().run_line_magic('timeit', 'voronoi(large[0,:], large[1,:])')

# In[8]:

from scipy.spatial import Voronoi

get_ipython().run_line_magic('timeit', 'Voronoi(small.T)')
get_ipython().run_line_magic('timeit', 'Voronoi(large.T)')

# # HDF5 vs SQLite3 vs PostgreSQL
#
# Which has the faster insert performance? Which has the faster read performance? This is
# specifically on a key-value index.
# In[1]:

import random

words = 'ad adipisicing aliqua aliquip amet anim aute cillum commodo consectetur consequat culpa cupidatat deserunt do dolor dolore duis ea eiusmod elit enim esse est et eu ex excepteur exercitation fugiat id in incididunt ipsum irure labore laboris laborum lorem magna minim mollit nisi non nostrud nulla occaecat officia pariatur proident qui quis reprehenderit sed sint sit sunt tempor ullamco ut velit veniam voluptate'.split()

def get_data(size, keylen=5, vallen=10):
    'Return an array of random key, int, str combinations'
    result = []
    hi = len(words) - 1
    keys = set()
    for index in range(size):
        while True:
            key = ' '.join(words[random.randint(0, hi)] for i in range(keylen))
            if key not in keys:
                break
        keys.add(key)
        num = random.randint(0, 10000000)
        val = ' '.join(words[random.randint(0, hi)] for i in range(vallen))
        result.append([key, num, val])
    return result

# In[2]:

import time
import sqlite3

def insert_sqlite3(data, drop=True):
    conn = sqlite3.connect('.test.sqlite3')
    try:
        if drop:
            conn.execute('DROP TABLE IF EXISTS test')
        conn.execute('CREATE TABLE IF NOT EXISTS test (k TEXT, n INTEGER, v TEXT, PRIMARY KEY(k))')
        start = time.time()
        conn.executemany('INSERT INTO test VALUES (?, ?, ?)', data)
        conn.commit()
        return time.time() - start
    finally:
        conn.close()

# In[3]:

import time
import psycopg2

def insert_postgres(data, drop=True):
    conn = psycopg2.connect('host=localhost dbname=test user=postgres')
    try:
        cur = conn.cursor()
        if drop:
            cur.execute('DROP TABLE IF EXISTS test')
        cur.execute('CREATE TABLE IF NOT EXISTS test (k VARCHAR(70), n INTEGER, v VARCHAR(300), PRIMARY KEY(k))')
        conn.commit()
        start = time.time()
        cur = conn.cursor()
        cur.executemany('INSERT INTO test VALUES (%s, %s, %s)', data)
        conn.commit()
        return time.time() - start
    finally:
        cur.close()
        conn.close()

# In[17]:

import time
import tables

class Test(tables.IsDescription):
    k = tables.StringCol(itemsize=70, pos=0)
    n = tables.Int32Col(pos=1)
    v = tables.StringCol(itemsize=300, pos=2)

def insert_hdf5(data, drop=True):
    handle = tables.open_file('.test.h5', mode='w')
    try:
        root = handle.root
        table = handle.create_table(root, 'test', Test)
        insert = table.row
        start = time.time()
        for row in data:
            insert['k'], insert['n'], insert['v'] = row
            insert.append()
        table.flush()
        return time.time() - start
    finally:
        handle.close()

# In[19]:

print(' size sqlite3 postgres hdf5')
for size in (1, 10, 100, 1000, 10000, 25000):
    print(
        '% 6d' % size,
        '% 7.1fms' % (insert_sqlite3(get_data(size), drop=True) * 1000),
        '% 7.1fms' % (insert_postgres(get_data(size), drop=True) * 1000),
        '% 7.1fms' % (insert_hdf5(get_data(size), drop=True) * 1000)
    )
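# The loop above only times inserts. As a hedged sketch (not in the original), here is one
# way to compare read performance on the same key-value stores: look up a batch of keys
# that already exist in each. The helpers below reuse the files and table created by the
# insert functions; the helper names are mine, not from the original.

# In[ ]:

# Sketch only: point lookups by key against the stores the insert benchmark left behind.
# Run this after the insert loop above so the stores are populated.
import time
import sqlite3
import psycopg2
import tables

def time_sqlite3_reads(count=1000):
    conn = sqlite3.connect('.test.sqlite3')
    try:
        keys = [row[0] for row in conn.execute('SELECT k FROM test LIMIT ?', (count,))]
        start = time.time()
        for key in keys:
            conn.execute('SELECT n, v FROM test WHERE k = ?', (key,)).fetchone()
        return time.time() - start
    finally:
        conn.close()

def time_postgres_reads(count=1000):
    conn = psycopg2.connect('host=localhost dbname=test user=postgres')
    try:
        cur = conn.cursor()
        cur.execute('SELECT k FROM test LIMIT %s', (count,))
        keys = [row[0] for row in cur.fetchall()]
        start = time.time()
        for key in keys:
            cur.execute('SELECT n, v FROM test WHERE k = %s', (key,))
            cur.fetchone()
        return time.time() - start
    finally:
        conn.close()

def time_hdf5_reads(count=1000):
    handle = tables.open_file('.test.h5', mode='r')
    try:
        table = handle.root.test
        keys = table.col('k')[:count]
        start = time.time()
        for key in keys:
            # Without an index on k this is a scan per lookup;
            # table.cols.k.create_csindex() would speed it up.
            table.read_where('k == key')
        return time.time() - start
    finally:
        handle.close()

print('sqlite3  %.1fms' % (time_sqlite3_reads() * 1000))
print('postgres %.1fms' % (time_postgres_reads() * 1000))
print('hdf5     %.1fms' % (time_hdf5_reads() * 1000))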
# # How fast is data access?
#
# When it comes to data, the performance of the CPU/memory, the disk and the database are
# key. This script measures how fast your system performs on these parameters.
#
# The whole script should run under a minute on most reasonably fast systems.

# In[1]:

from __future__ import print_function

import io
import time
import sqlalchemy
import numpy as np
import pandas as pd
from pathlib import Path

class Timer:
    def __init__(self, msg):
        self.msg = msg

    def __enter__(self):
        self.start = time.clock()

    def __exit__(self, *args):
        self.end = time.clock()
        print('{:0.3f}s {:s}'.format(self.end - self.start, self.msg))

# ### CPU / RAM
#
# This is a pure numerical computation on values in memory: it computes the
# [eigenvalues](http://docs.scipy.org/doc/numpy-1.10.0/reference/generated/numpy.linalg.eig.html)
# of a random dataset.

# In[11]:

# Initialise the same data every time
np.random.seed(0)
data = np.random.random((1000, 1000))

# Time the computation
with Timer('computation'):
    np.linalg.eig(data)

# ### Disk
#
# Let's time sequential writes and reads on the disk.
#
# The best way to do this is via [disktt](https://www.google.com/search?q=disktt) on
# Windows and [dd on Linux](https://www.thomas-krenn.com/en/wiki/Linux_I/O_Performance_Tests_using_dd).
#
# Below is a crude approximation in Python. Note: this is heavily influenced by OS disk caching.

# In[12]:

# Change this to any folder in the drive you want to test
folder = Path('D:/')

# In[13]:

# Test the speed of the hard disk at this folder
# ... with this string data
data = b'0123456789' * 100000000

# Run the test
path = folder / 'tempfile'
with path.open(mode='wb', buffering=0) as handle:
    with Timer('sequential disk write'):
        handle.write(data)
with path.open(mode='rb', buffering=0) as handle:
    with Timer('sequential disk read'):
        handle.read()
path.unlink()

# ### Database
#
# This tests the speed of MySQL. It assumes a MySQL instance running on localhost with a
# database called `test` accessible to user `root` with no password. You can
# [change the connection string](http://docs.sqlalchemy.org/en/latest/core/engines.html#mysql)
# based on your configuration.

# In[14]:

# This is for a local MySQL database called test that you can connect to as root with no password
engine = sqlalchemy.create_engine('mysql+pymysql://root@localhost/dbtest')
# Test the connection
connection = engine.connect()

# In[15]:

# Set up the data structures
data = pd.DataFrame(np.random.randint(0, 1000, (1000000, 3)))
metadata = sqlalchemy.MetaData(bind=engine)
metadata.reflect()

# Drop the benchmark table if it exists
if 'benchmark' in metadata.tables:
    metadata.tables['benchmark'].drop()

# Create the benchmark table again as MyISAM
table = sqlalchemy.Table(
    'benchmark', metadata,
    sqlalchemy.Column('0', sqlalchemy.Integer),
    sqlalchemy.Column('1', sqlalchemy.Integer),
    sqlalchemy.Column('2', sqlalchemy.Integer),
    extend_existing=True,
    mysql_engine='MyISAM',
)
metadata.create_all()

# In[16]:

with Timer('database write'):
    data.to_sql('benchmark', con=engine, if_exists='append', index=False)

with Timer('database read'):
    data = pd.read_sql('benchmark', con=engine)

# # Group by categories
#
# Pandas processes categories much faster than text.
# For this sample:
#
#                         text    categories
#     .groupby() time    591ms          74ms    categories are ~8X faster in this case
#     Memory usage       512MB          86MB    categories are much smaller (depends on text length)

# In[56]:

import pandas as pd

cat1 = ['Alpha', 'Beta', 'Gamma', 'Delta', 'Epsilon', 'Zeta', 'Eta']

# In[57]:

n = 10000000
text = pd.DataFrame({
    'cat1': pd.Series(pd.np.random.randint(0, len(cat1), n)).map(dict(enumerate(cat1))),
    'val': pd.np.random.rand(n)
})
cats = text.copy()
cats['cat1'] = cats['cat1'].astype('category')

# In[58]:

get_ipython().run_line_magic('timeit', "text.groupby('cat1')['val'].sum()")
get_ipython().run_line_magic('timeit', "cats.groupby('cat1')['val'].sum()")

# In[59]:

text.info(memory_usage='deep')
cats.info(memory_usage='deep')

# # Database groupby
#
# Observations on the performance of a `SELECT category, COUNT(id) FROM table GROUP BY category` query:
#
# ### MySQL
#
# - Creating a hash / btree index on `category` does not speed it up (1.5s)
# - Using `VARCHAR` instead of `TEXT` does not speed it up
# - In-memory tables improve performance by just 20-40% (1.2s)
# - InnoDB tables are about 60X slower than MyISAM without an index (90s)
# - MySQL intelligently caches queries (which is good). Use `RESET QUERY CACHE` to reset
#   the cache between benchmark runs
#
# To move a large table into an in-memory table:
#
#     SET GLOBAL tmp_table_size = 1024 * 1024 * 1024 * 2;
#     SET GLOBAL max_heap_table_size = 1024 * 1024 * 1024 * 2;
#
#     # Now disconnect and reconnect
#
#     CREATE TABLE mem LIKE inr_import;
#     ALTER TABLE mem ENGINE=MEMORY;
#     INSERT INTO mem SELECT * FROM inr_import;
#
# ### PostgreSQL
#
# - Creating a hash index on `category` does not speed it up

# In[ ]:
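# A hedged sketch, not part of the original notes, of how these observations can be
# reproduced from Python: time the aggregation on the server and pull back only the
# summary rows. The connection string and the `inr_import` table layout (a `category`
# column and an `id` column) are assumptions based on the notes above.

# In[ ]:

# Sketch only: run the GROUP BY on the database and fetch just the per-category counts.
# Issue RESET QUERY CACHE between runs (older MySQL) so the cache does not skew timings.
import time
import pandas as pd
import sqlalchemy

engine = sqlalchemy.create_engine('mysql+pymysql://root@localhost/test')
start = time.time()
counts = pd.read_sql('SELECT category, COUNT(id) AS n FROM inr_import GROUP BY category', engine)
print('{:0.3f}s for {:d} categories'.format(time.time() - start, len(counts)))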