#!/usr/bin/env python
# coding: utf-8

# # Updating a dictionary
#
# Often, we want to create a copy of a dictionary and update it. For example, we have a
# default set of parameters and want to update them without disturbing the original
# parameter list.
#
# There are two ways of doing this:
#
# 1. Make a copy of the dictionary and update it with the new dictionary. **This is about twice as fast.**
# 2. Concatenate the `.items()` lists of both dictionaries and make a new `dict` out of the
#    result (in Python 2, `.items()` returns a list, so the two lists can simply be added).
#
# Here is the benchmark (same keys = updating existing keys, new keys = adding keys):
#
#                                       same keys   new keys
#     1. make a copy and update            1.58µs     1.66µs
#     2. add .items() and dictify          2.81µs     3.36µs

# In[1]:

base = {x: x for x in range(20)}
same = {x: x for x in range(20)}
incr = {x: x for x in range(20, 40)}

get_ipython().run_line_magic('timeit', 'y=dict(base); y.update(same)')
get_ipython().run_line_magic('timeit', 'y=dict(base); y.update(incr)')

base = base.items()
same = same.items()
incr = incr.items()

get_ipython().run_line_magic('timeit', 'dict(base + same)')
get_ipython().run_line_magic('timeit', 'dict(base + incr)')

# # Large string creation
#
# Array joins are faster than successive appending:
#
#                     1,000     10,000    100,000   1,000,000
#     appending       226µs     2.87ms    15.8ms    484ms
#     array joins     116µs     1.11ms    11.2ms    146ms

# In[4]:

def string_append(s, count):
    result = ''
    for x in range(count):
        result += s
    return result

def array_join(s, count):
    result = []
    for x in range(count):
        result.append(s)
    return ''.join(result)

for count in [1000, 10000, 100000, 1000000]:
    print count, 'concatenations'
    get_ipython().run_line_magic('timeit', "string_append('abc', count)")
    get_ipython().run_line_magic('timeit', "array_join('abc', count)")

# # Date parsing
#
# Date parsing in Python is quite slow, especially for large arrays.
# Here's a benchmark of various approaches:
#
#     to_datetime:  7740 ms
#     dateutil:     6970 ms
#     strptime:     1660 ms
#     manual:        253 ms
#     lookup:          9 ms
#
# Manually slicing the date string is significantly faster.
# If there aren't many distinct dates, lookups are *much* faster.

# In[2]:

import time
import datetime
import dateutil.parser
import pandas as pd

s = pd.Series(['01-31-2012'] * 100000)

# In[5]:

# Use Pandas' built-in to_datetime
get_ipython().run_line_magic('timeit', 'pd.to_datetime(s)')

# In[6]:

# Use dateutil.parser
get_ipython().run_line_magic('timeit', 's.apply(dateutil.parser.parse)')

# In[7]:

# Parse using datetime.strptime
get_ipython().run_line_magic('timeit', "s.apply(lambda v: datetime.datetime.strptime(v, '%m-%d-%Y'))")

# In[8]:

# Manually parse the date
get_ipython().run_line_magic('timeit', 's.apply(lambda v: datetime.datetime(int(v[6:10]), int(v[0:2]), int(v[3:5])))')

# In[9]:

def lookup(s):
    """
    This is an extremely fast approach to datetime parsing.
    For large data, the same dates are often repeated. Rather than
    re-parse these, we store all unique dates, parse them, and
    use a lookup to convert all dates.
    """
    return s.map({date: pd.to_datetime(date) for date in s.unique()})

get_ipython().run_line_magic('timeit', 'lookup(s)')

# # Mean vs Median
#
# Mean is *much (~100 times) faster* to calculate than median.

# In[9]:

import time
import numpy

data = numpy.random.rand(50000000)

# In[10]:

get_ipython().run_line_magic('timeit', 'numpy.mean(data)')

# In[11]:

get_ipython().run_line_magic('timeit', 'numpy.median(data)')
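# A hedged aside, not part of the original benchmark: most of the median's extra cost is
# the selection step. `numpy.median` relies on a partial sort (`numpy.partition`) rather
# than a full sort, which is still far more work than the single pass a mean needs.
# A rough sketch of that, reusing the `data` array above:

# In[ ]:

# Sketch only: compare the single-pass mean against the partition a median needs,
# and against a full sort, on the same 50M-element array.
get_ipython().run_line_magic('timeit', 'numpy.mean(data)')
get_ipython().run_line_magic('timeit', 'numpy.partition(data, data.size // 2)')
get_ipython().run_line_magic('timeit', 'numpy.sort(data)')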
# # Reading data
#
# HDF5 is the fastest way of reading tabular data.
#
#     csv.DictReader:        2.78 s
#     pickle:                2.41 s
#     json:                  2.39 s
#     json-array:            799 ms
#     csv.reader:            478 ms
#     pd.read_csv:           355 ms
#     pd.read_pickle:        319 ms
#     pd.read_hdf (table):   169 ms
#     pd.read_hdf (stored):  123 ms

# In[1]:

# First, create a set of data files
words = 'ad adipisicing aliqua aliquip amet anim aute cillum commodo consectetur consequat culpa cupidatat deserunt do dolor dolore duis ea eiusmod elit enim esse est et eu ex excepteur exercitation fugiat id in incididunt ipsum irure labore laboris laborum lorem magna minim mollit nisi non nostrud nulla occaecat officia pariatur proident qui quis reprehenderit sed sint sit sunt tempor ullamco ut velit veniam voluptate'.split()

# Create the data in memory
data = []
for row in range(0, 1000000):
    data.append({
        'A': words[row % len(words)],
        'B': chr(64 + (row % 62)),
        'C': row,
        'D': row + 1,
        'E': row + 2,
        'F': row + 3,
    })

# Save CSV
import csv
keys = sorted(data[0].keys())
out = csv.DictWriter(open('sample.data.csv', 'w'), fieldnames=keys, lineterminator='\n')
out.writerow(dict(zip(keys, keys)))
out.writerows(data)

# Save JSON
import json
json.dump(data, open('sample.data.json', 'w'), separators=(',', ':'))

# Save JSON-array
json.dump([data[0].keys()] + [row.values() for row in data],
          open('sample.data-array.json', 'w'), separators=(',', ':'))

# Save pickle
import cPickle as pickle
pickle.dump(data, open('sample.data.pickle', 'wb'), pickle.HIGHEST_PROTOCOL)

# Save pandas pickle
import pandas as pd
df = pd.DataFrame(data, columns=data[0].keys())
df.to_pickle('sample.data.pandas')

# Save HDF5
df.to_hdf('sample.data.h5', 'stored')
df.to_hdf('sample.data.h5', 'table', table=True)

# In[2]:

import time
import csv
import json
import cPickle as pickle
import pandas as pd

get_ipython().run_line_magic('timeit', "list(csv.DictReader(open('sample.data.csv')))")
get_ipython().run_line_magic('timeit', "pickle.load(open('sample.data.pickle', 'rb'))")
get_ipython().run_line_magic('timeit', "json.load(open('sample.data.json'))")
get_ipython().run_line_magic('timeit', "json.load(open('sample.data-array.json'))")
get_ipython().run_line_magic('timeit', "list(csv.reader(open('sample.data.csv')))")
get_ipython().run_line_magic('timeit', "pd.read_csv('sample.data.csv')")
get_ipython().run_line_magic('timeit', "pd.read_pickle('sample.data.pandas')")
get_ipython().run_line_magic('timeit', "pd.read_hdf('sample.data.h5', 'table')")
get_ipython().run_line_magic('timeit', "pd.read_hdf('sample.data.h5', 'stored')")
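# A hedged footnote to the benchmark above: the 'stored' key uses the fixed HDF5 format
# (fastest to load in full), while the 'table' key uses the table format, which can be
# queried on read. A sketch of that, assuming the sample.data.h5 file written above:

# In[ ]:

# Sketch only: table-format stores accept a where= clause (on the index, and on any
# data_columns declared at write time), so a slice can be read without loading all rows.
subset = pd.read_hdf('sample.data.h5', 'table', where='index < 1000')
print len(subset)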
# # Templates vs lxml vs cElementTree
#
# This is the time taken to generate a bar chart, in µs. The output can be either XML
# (an etree) or text.
#
#                       xml     text
#     template           68       35
#     lxml               73       87
#     cElementTree       23      247
#
# For string output, tornado templates are extremely fast. Even for etree output, a
# template plus `etree.fromstring` is faster than building the tree with lxml directly.
# However, if you only want etree output and not a string, cElementTree is faster.
#
# To me, the template approach with `lxml.fromstring` appears optimal.

# In[1]:

from IPython.display import HTML
from tornado import template

using_template = template.Template('''<svg width="100" height="50">
{% for i, x in enumerate(series) %}
    <rect x="{{ 10 * i }}" width="10" y="{{ 50 - 10 * x }}" height="{{ 10 * x }}" fill="#88f" stroke="#fff"/>
{% end %}
</svg>''', autoescape=None).generate

HTML(using_template(series=[1, 2, 3, 4, 3, 2, 1]))

# In[2]:

from lxml import etree

def using_lxml(series):
    root = etree.Element('svg', width="100", height="50")
    for i, x in enumerate(series):
        rect = etree.SubElement(root, 'rect',
                                x='%d' % (10 * i),
                                width='10',
                                y='%d' % (50 - 10 * x),
                                height='%d' % (10 * x),
                                fill='#88f',
                                stroke='#fff')
    return root

HTML(etree.tostring(using_lxml(series=[1, 2, 3, 4, 3, 2, 1])))

# In[3]:

import xml.etree.cElementTree as cElementTree

def using_cElementTree(series):
    root = cElementTree.Element('svg', width="100", height="50")
    for i, x in enumerate(series):
        rect = cElementTree.SubElement(root, 'rect',
                                       x='%d' % (10 * i),
                                       width='10',
                                       y='%d' % (50 - 10 * x),
                                       height='%d' % (10 * x),
                                       fill='#88f',
                                       stroke='#fff')
    return root

HTML(cElementTree.tostring(using_cElementTree(series=[1, 2, 3, 4, 3, 2, 1])))

# In[6]:

# Create etree output
get_ipython().run_line_magic('timeit', 'etree.fromstring(using_template(series=[1,2,3,4,3,2,1]))')
get_ipython().run_line_magic('timeit', 'using_lxml(series=[1,2,3,4,3,2,1])')
get_ipython().run_line_magic('timeit', 'using_cElementTree(series=[1,2,3,4,3,2,1])')

# Create string output
get_ipython().run_line_magic('timeit', 'using_template(series=[1,2,3,4,3,2,1])')
get_ipython().run_line_magic('timeit', 'etree.tostring(using_lxml(series=[1,2,3,4,3,2,1]))')
get_ipython().run_line_magic('timeit', 'cElementTree.tostring(using_cElementTree(series=[1,2,3,4,3,2,1]))')

# # Range search
#
# Here, we're trying to find where a value fits in a list of numbers. For example, in the
# list [1, 3, 7, 9], the number 4 would be just after the 2nd element (3).
#
# The summary is: use `numpy.searchsorted()` -- it's blazingly fast.
#
#     37,000 µs   For loop
#      5,790 µs   Numpy filtering
#      2,270 µs   Numpy filtering on sorted values
#      1,850 µs   Numpy index search on sorted values
#          1 µs   numpy.searchsorted()
#
# Having read [this post](http://blog.clifreeder.com/blog/2013/04/21/ruby-is-too-slow-for-programming-competitions/)
# on Ruby being slow, I thought I'd check the same with Python. I got it running fairly
# fast, but there was one piece that was taking a fair bit of time: *counting numbers in
# a range*. Here's the slow version:

# In[1]:

values = range(1000000)

def count(values, a, b):
    count = 0
    for x in values:
        if a <= x <= b:
            count += 1
    return count

get_ipython().run_line_magic('timeit', 'count(values, 250000, 750000)')

# Of course, running a loop over numbers in Python is never a good idea. Let's move this
# to NumPy.

# In[2]:

import numpy

values = numpy.random.rand(1000000)

get_ipython().run_line_magic('timeit', '((.25 <= values) & (values <= .75)).sum()')

# That's not bad, but it could get a lot better. First, let's sort the values and try again.

# In[3]:

values.sort()

get_ipython().run_line_magic('timeit', '((.25 <= values) & (values <= .75)).sum()')

# Just like that, it's faster. But we can do much better. Given that the array is already
# sorted, what if we just found the indices?

# In[4]:

get_ipython().run_line_magic('timeit', '(values <= .75).argmin() - (.25 <= values).argmax()')

# A bit faster. It's wasteful of memory, though, since it creates two new boolean arrays
# just to find the positions of two numbers. What if we searched for them instead?

# In[5]:

get_ipython().run_line_magic('timeit', 'numpy.searchsorted(values, .75) - numpy.searchsorted(values, .25)')

# That's 1.45 *micro*seconds. It's *25 thousand* times faster than the original code, and
# *four thousand* times faster than the original NumPy code.
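# A hedged footnote, not from the original post: by default `numpy.searchsorted` uses
# side='left', so the difference above counts the half-open range .25 <= x < .75. If the
# closed range matters, the `side` argument makes it explicit:

# In[ ]:

# Sketch only: side='left' gives the first index with values[i] >= a, side='right' gives
# the first index with values[i] > b, so their difference counts a <= x <= b exactly.
count_closed = (numpy.searchsorted(values, .75, side='right') -
                numpy.searchsorted(values, .25, side='left'))
print count_closed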
# If there's one thing I keep re-learning, it's that there's always a faster way of doing
# it, and if you really want to, you'll probably find it.

# # Next power of 10
#
# The next power of 10 for 4 is 10^1. For 40, it's 10^2. For 400, it's 10^3. For 0.04,
# it's 10^-1. And so on.
#
# Most methods of calculating it are fast enough.

# In[1]:

import numpy as np
import pandas as pd

data = pd.Series(10 ** (6 * np.random.rand(10000) - 3))

# In[2]:

def iterative(v):
    i = 1
    if v > 1:
        n = 0
        while i < v:
            i, n = i * 10, n + 1
    else:
        n = 1
        while i > v:
            i, n = i / 10., n - 1
    return n

get_ipython().run_line_magic('timeit', 'data.apply(iterative)')

# In[3]:

get_ipython().run_line_magic('timeit', 'numpy.ceil(numpy.log10(data))')

# In[4]:

get_ipython().run_line_magic('timeit', 'data.apply(lambda v: numpy.ceil(numpy.log10(v)))')

# # Hierarchical subtotals
#
# Given a DataFrame like this:
#
#     A  B  C  val
#     X  X  X    0
#     X  X  Y    1
#     X  Y  X    2
#     X  Y  Y    3
#     Y  X  X    4
#     Y  X  Y    5
#     Y  Y  X    6
#     Y  Y  Y    7
#
# ... create a DataFrame like this, with subtotals:
#
#     A   B   C   val  level
#     na  na  na   28      0
#     X   na  na    6      1
#     X   X   na    1      2
#     X   X   X     0      3
#     X   X   Y     1      3
#     X   Y   na    5      2
#     X   Y   X     2      3
#     X   Y   Y     3      3
#     Y   na  na   22      1
#     Y   X   na    9      2
#     Y   X   X     4      3
#     Y   X   Y     5      3
#     Y   Y   na   13      2
#     Y   Y   X     6      3
#     Y   Y   Y     7      3

# In[57]:

data = pd.DataFrame({'A': list('XXXXYYYY'),
                     'B': list('XXYYXXYY'),
                     'C': list('XYXYXYXY'),
                     'val': range(8)})

# In[85]:

groups = ['A', 'B', 'C']

def subtotal(data, groups, agg):
    frames = []
    for level in range(1, 1 + len(groups)):
        frame = data.groupby(groups[:level], sort=False, as_index=False).agg(agg)
        frame['level'] = level
        frames.append(frame)
    df = pd.concat(frames)
    for group in groups:
        df[group].fillna('', inplace=True)
    return df.sort(groups).set_index(groups)

print subtotal(data, groups=groups, agg={'val': 'sum'})

# This is faster than the existing `layout.hierarchy`.

# In[91]:

import layout

odi = pd.read_csv('d:/site/gramener.com/viz/autolyse/data/odi-batting.csv', dtype={'Runs': float})
groups = ['Weekday', 'Country', 'Player']
agg = {'Runs': 'sum'}

get_ipython().run_line_magic('timeit', 'subtotal(odi, groups, agg)')
get_ipython().run_line_magic('timeit', "list(layout.hierarchy(odi, groups, agg=agg, size=lambda df: df['Runs'].sum()))")

# In[ ]:

# stack(series, groupby)

# # Numba
#
# I'm trying to see how fast numba is. `autojit(fn)` makes `fn` faster. `numpy.sum` and
# `@autojit` take about the same time. Python loops are much slower.
#
# Looks like @autojit is a decent replacement for `numpy.vectorize`.

# In[2]:

from numba import autojit

def slow_sum(arr):
    M, N = arr.shape
    result = 0.0
    for i in range(M):
        for j in range(N):
            result += arr[i, j]
    return result

fast_sum = autojit(slow_sum)

# In[3]:

get_ipython().run_line_magic('timeit', 'numpy.sum(numpy.random.rand(1000,1000))')
get_ipython().run_line_magic('timeit', 'fast_sum(numpy.random.rand(1000,1000))')
get_ipython().run_line_magic('timeit', 'slow_sum(numpy.random.rand(1000,1000))')
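# A hedged note beyond the original: recent Numba releases dropped `autojit`; the
# equivalent today is the `numba.njit` decorator (or `@jit(nopython=True)`). A minimal
# sketch of the same sum with that API, assuming a current Numba install:

# In[ ]:

# Sketch only: decorator form of the JIT-compiled sum above.
from numba import njit

@njit
def njit_sum(arr):
    M, N = arr.shape
    result = 0.0
    for i in range(M):
        for j in range(N):
            result += arr[i, j]
    return result

get_ipython().run_line_magic('timeit', 'njit_sum(numpy.random.rand(1000, 1000))')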
# # Shifting a Numpy array
#
# If you have an array like `[3,4,5,6,7]` and you want to shift it one place to the right,
# dropping the last element and filling the left with NaN to get `[nan,3,4,5,6]`, what's
# the fastest way?
#
# Answer:
#
#     result = numpy.roll(array, 1)
#     result[0] = numpy.nan

# In[10]:

data = numpy.random.rand(1001)

get_ipython().run_line_magic('timeit', 'result = numpy.insert(data, 0, numpy.nan)[:-1]')
get_ipython().run_line_magic('timeit', 'result = numpy.roll(data, 1); result[:1] = numpy.nan')

# # Voronoi diagrams
#
# [Voronoi diagrams](http://en.wikipedia.org/wiki/Voronoi_diagram) take a set of points
# and create polygons enclosing the space closer to each point than to any other. This is
# the dual of [Delaunay triangulation](http://en.wikipedia.org/wiki/Delaunay_triangulation),
# which matplotlib and scipy provide by default, and from which the Voronoi segments can
# be built directly with NumPy.
#
# Here's the speed of generation via various methods:
#
#     Method                                          Time (10K)   Time (100K)
#     matplotlib.delaunay.triangulate.Triangulation     16.5ms       222ms
#     voronoi() using the above                         41.9ms       793ms
#     scipy.spatial.Delaunay                            51.4ms       797ms

# In[3]:

import numpy

# In[4]:

scale = .9
small = (1 - scale) / 2 + scale * numpy.random.rand(2, 10000)
large = (1 - scale) / 2 + scale * numpy.random.rand(2, 100000)

# In[5]:

import matplotlib.delaunay.triangulate as tri

get_ipython().run_line_magic('timeit', 'tri.Triangulation(*small)')
get_ipython().run_line_magic('timeit', 'tri.Triangulation(*large)')

# In[6]:

def voronoi(X, Y):
    ''' Return line segments describing the voronoi diagram of X and Y '''
    # Get the points X, Y into a matrix P.
    P = numpy.zeros((X.size + 4, 2))
    P[:X.size, 0], P[:Y.size, 1] = X, Y

    # Add four points at (pseudo) "infinity"
    m = max(numpy.abs(X).max(), numpy.abs(Y).max()) * 1e5
    P[X.size:, 0] = -m, -m, +m, +m
    P[Y.size:, 1] = -m, +m, -m, +m

    # Delaunay triangulate, and get the circumcenters
    D = tri.Triangulation(P[:, 0], P[:, 1])
    C = D.circumcenters

    # D.triangle_neighbors gives 3 neighbours per triangle.
    # Each (triangle, neighbour) pair represents a line.
    n = len(C)
    tgt = D.triangle_neighbors
    src = (numpy.zeros_like(tgt).T + numpy.arange(n)).T

    # Remove all -1s
    positives = tgt >= 0
    n = positives.sum()
    src = src[positives].reshape(n)
    tgt = tgt[positives].reshape(n)

    # TODO: Clip to get polygons
    # --------------------------

    # Get areas
    # ---------
    # http://www.mathopenref.com/coordpolygonarea.html
    csrc = C[src]
    ctgt = C[tgt]
    areas = csrc[:, 0] * ctgt[:, 1] - csrc[:, 1] * ctgt[:, 0]
    # print areas
    # Now add up the areas by the indices given in src

    # Get the circumcenters
    return numpy.concatenate((C[tgt].reshape(n, 1, 2), C[src].reshape(n, 1, 2)), axis=1)

get_ipython().run_line_magic('timeit', 'voronoi(small[0,:], small[1,:])')
get_ipython().run_line_magic('timeit', 'voronoi(large[0,:], large[1,:])')

# In[8]:

from scipy.spatial import Voronoi

get_ipython().run_line_magic('timeit', 'Voronoi(small.T)')
get_ipython().run_line_magic('timeit', 'Voronoi(large.T)')

# # HDF5 vs SQLite3 vs PostgreSQL
#
# Which has the faster insert performance? Which has the faster read performance? This is
# specifically on a key-value index.
# In[1]:

import random

words = 'ad adipisicing aliqua aliquip amet anim aute cillum commodo consectetur consequat culpa cupidatat deserunt do dolor dolore duis ea eiusmod elit enim esse est et eu ex excepteur exercitation fugiat id in incididunt ipsum irure labore laboris laborum lorem magna minim mollit nisi non nostrud nulla occaecat officia pariatur proident qui quis reprehenderit sed sint sit sunt tempor ullamco ut velit veniam voluptate'.split()

def get_data(size, keylen=5, vallen=10):
    'Return an array of random key, int, str combinations'
    result = []
    hi = len(words) - 1
    keys = set()
    for index in range(size):
        while True:
            key = ' '.join(words[random.randint(0, hi)] for i in range(keylen))
            if key not in keys:
                break
        keys.add(key)
        num = random.randint(0, 10000000)
        val = ' '.join(words[random.randint(0, hi)] for i in range(vallen))
        result.append([key, num, val])
    return result

# In[2]:

import time
import sqlite3

def insert_sqlite3(data, drop=True):
    conn = sqlite3.connect('.test.sqlite3')
    try:
        if drop:
            conn.execute('DROP TABLE IF EXISTS test')
        conn.execute('CREATE TABLE IF NOT EXISTS test (k TEXT, n INTEGER, v TEXT, PRIMARY KEY(k))')
        start = time.time()
        conn.executemany('INSERT INTO test VALUES (?, ?, ?)', data)
        conn.commit()
        return time.time() - start
    finally:
        conn.close()

# In[3]:

import time
import psycopg2

def insert_postgres(data, drop=True):
    conn = psycopg2.connect('host=localhost dbname=test user=postgres')
    try:
        cur = conn.cursor()
        if drop:
            cur.execute('DROP TABLE IF EXISTS test')
        cur.execute('CREATE TABLE IF NOT EXISTS test (k VARCHAR(70), n INTEGER, v VARCHAR(300), PRIMARY KEY(k))')
        conn.commit()
        start = time.time()
        cur = conn.cursor()
        cur.executemany('INSERT INTO test VALUES (%s, %s, %s)', data)
        conn.commit()
        return time.time() - start
    finally:
        cur.close()
        conn.close()

# In[17]:

import time
import tables

class Test(tables.IsDescription):
    k = tables.StringCol(itemsize=70, pos=0)
    n = tables.Int32Col(pos=1)
    v = tables.StringCol(itemsize=300, pos=2)

def insert_hdf5(data, drop=True):
    handle = tables.open_file('.test.h5', mode='w')
    try:
        root = handle.root
        table = handle.create_table(root, 'test', Test)
        insert = table.row
        start = time.time()
        for row in data:
            insert['k'], insert['n'], insert['v'] = row
            insert.append()
        table.flush()
        return time.time() - start
    finally:
        handle.close()

# In[19]:

print(' size sqlite3 postgres hdf5')
for size in (1, 10, 100, 1000, 10000, 25000):
    print(
        '% 6d' % size,
        '% 7.1fms' % (insert_sqlite3(get_data(size), drop=True) * 1000),
        '% 7.1fms' % (insert_postgres(get_data(size), drop=True) * 1000),
        '% 7.1fms' % (insert_hdf5(get_data(size), drop=True) * 1000)
    )
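# The loop above only times inserts. As a hedged sketch (not in the original), here is one
# way to compare read performance on the same key-value stores: look up a batch of keys
# that already exist in each. The helpers below reuse the files and table created by the
# insert functions; the helper names are mine, not from the original.

# In[ ]:

# Sketch only: point lookups by key against the stores the insert benchmark left behind.
# Run this after the insert loop above so the stores are populated.
import time
import sqlite3
import psycopg2
import tables

def time_sqlite3_reads(count=1000):
    conn = sqlite3.connect('.test.sqlite3')
    try:
        keys = [row[0] for row in conn.execute('SELECT k FROM test LIMIT ?', (count,))]
        start = time.time()
        for key in keys:
            conn.execute('SELECT n, v FROM test WHERE k = ?', (key,)).fetchone()
        return time.time() - start
    finally:
        conn.close()

def time_postgres_reads(count=1000):
    conn = psycopg2.connect('host=localhost dbname=test user=postgres')
    try:
        cur = conn.cursor()
        cur.execute('SELECT k FROM test LIMIT %s', (count,))
        keys = [row[0] for row in cur.fetchall()]
        start = time.time()
        for key in keys:
            cur.execute('SELECT n, v FROM test WHERE k = %s', (key,))
            cur.fetchone()
        return time.time() - start
    finally:
        conn.close()

def time_hdf5_reads(count=1000):
    handle = tables.open_file('.test.h5', mode='r')
    try:
        table = handle.root.test
        keys = table.col('k')[:count]
        start = time.time()
        for key in keys:
            # Without an index on k this is a scan per lookup;
            # table.cols.k.create_csindex() would speed it up.
            table.read_where('k == key')
        return time.time() - start
    finally:
        handle.close()

print('sqlite3  %.1fms' % (time_sqlite3_reads() * 1000))
print('postgres %.1fms' % (time_postgres_reads() * 1000))
print('hdf5     %.1fms' % (time_hdf5_reads() * 1000))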
# # How fast is data access?
#
# When it comes to data, the performance of the CPU/memory, the disk and the database are
# key. This script measures how fast your system performs on these parameters.
#
# The whole script should run under a minute on most reasonably fast systems.

# In[1]:

from __future__ import print_function

import io
import time
import sqlalchemy
import numpy as np
import pandas as pd
from pathlib import Path

class Timer:
    def __init__(self, msg):
        self.msg = msg

    def __enter__(self):
        self.start = time.clock()

    def __exit__(self, *args):
        self.end = time.clock()
        print('{:0.3f}s {:s}'.format(self.end - self.start, self.msg))

# ### CPU / RAM
#
# This is a pure numerical computation on values in memory: it computes the
# [eigenvalues](http://docs.scipy.org/doc/numpy-1.10.0/reference/generated/numpy.linalg.eig.html)
# of a random dataset.

# In[11]:

# Initialise the same data every time
np.random.seed(0)
data = np.random.random((1000, 1000))

# Time the computation
with Timer('computation'):
    np.linalg.eig(data)

# ### Disk
#
# Let's time sequential writes and reads on the disk.
#
# The best way to do this is via [disktt](https://www.google.com/search?q=disktt) on
# Windows and [dd on Linux](https://www.thomas-krenn.com/en/wiki/Linux_I/O_Performance_Tests_using_dd).
#
# Below is a crude approximation in Python. Note: this is heavily influenced by OS disk caching.

# In[12]:

# Change this to any folder in the drive you want to test
folder = Path('D:/')

# In[13]:

# Test the speed of the hard disk at this folder
# ... with this string data
data = b'0123456789' * 100000000

# Run the test
path = folder / 'tempfile'
with path.open(mode='wb', buffering=0) as handle:
    with Timer('sequential disk write'):
        handle.write(data)
with path.open(mode='rb', buffering=0) as handle:
    with Timer('sequential disk read'):
        handle.read()
path.unlink()

# ### Database
#
# This tests the speed of MySQL. It assumes a MySQL instance running on localhost with a
# database called `test` accessible to user `root` with no password. You can
# [change the connection string](http://docs.sqlalchemy.org/en/latest/core/engines.html#mysql)
# based on your configuration.

# In[14]:

# This is for a local MySQL database called test that you can connect to as root with no password
engine = sqlalchemy.create_engine('mysql+pymysql://root@localhost/dbtest')
# Test the connection
connection = engine.connect()

# In[15]:

# Set up the data structures
data = pd.DataFrame(np.random.randint(0, 1000, (1000000, 3)))
metadata = sqlalchemy.MetaData(bind=engine)
metadata.reflect()

# Drop the benchmark table if it exists
if 'benchmark' in metadata.tables:
    metadata.tables['benchmark'].drop()

# Create the benchmark table again as MyISAM
table = sqlalchemy.Table(
    'benchmark', metadata,
    sqlalchemy.Column('0', sqlalchemy.Integer),
    sqlalchemy.Column('1', sqlalchemy.Integer),
    sqlalchemy.Column('2', sqlalchemy.Integer),
    extend_existing=True,
    mysql_engine='MyISAM',
)
metadata.create_all()

# In[16]:

with Timer('database write'):
    data.to_sql('benchmark', con=engine, if_exists='append', index=False)

with Timer('database read'):
    data = pd.read_sql('benchmark', con=engine)

# # Group by categories
#
# Pandas processes categories much faster than text.
# For this sample:
#
#                         text    categories
#     .groupby() time    591ms          74ms    categories are ~8X faster in this case
#     Memory usage       512MB          86MB    categories are much smaller (depends on text length)

# In[56]:

import pandas as pd

cat1 = ['Alpha', 'Beta', 'Gamma', 'Delta', 'Epsilon', 'Zeta', 'Eta']

# In[57]:

n = 10000000
text = pd.DataFrame({
    'cat1': pd.Series(pd.np.random.randint(0, len(cat1), n)).map(dict(enumerate(cat1))),
    'val': pd.np.random.rand(n)
})
cats = text.copy()
cats['cat1'] = cats['cat1'].astype('category')

# In[58]:

get_ipython().run_line_magic('timeit', "text.groupby('cat1')['val'].sum()")
get_ipython().run_line_magic('timeit', "cats.groupby('cat1')['val'].sum()")

# In[59]:

text.info(memory_usage='deep')
cats.info(memory_usage='deep')

# # Database groupby
#
# Observations on the performance of a `SELECT category, COUNT(id) FROM table GROUP BY category` query:
#
# ### MySQL
#
# - Creating a hash / btree index on `category` does not speed it up (1.5s)
# - Using `VARCHAR` instead of `TEXT` does not speed it up
# - In-memory tables improve performance by just 20-40% (1.2s)
# - InnoDB tables are about 60X slower than MyISAM without an index (90s)
# - MySQL intelligently caches queries (which is good). Use `RESET QUERY CACHE` to reset
#   the cache between benchmark runs
#
# To move a large table into an in-memory table:
#
#     SET GLOBAL tmp_table_size = 1024 * 1024 * 1024 * 2;
#     SET GLOBAL max_heap_table_size = 1024 * 1024 * 1024 * 2;
#
#     # Now disconnect and reconnect
#
#     CREATE TABLE mem LIKE inr_import;
#     ALTER TABLE mem ENGINE=MEMORY;
#     INSERT INTO mem SELECT * FROM inr_import;
#
# ### PostgreSQL
#
# - Creating a hash index on `category` does not speed it up

# In[ ]:
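# A hedged sketch, not part of the original notes, of how these observations can be
# reproduced from Python: time the aggregation on the server and pull back only the
# summary rows. The connection string and the `inr_import` table layout (a `category`
# column and an `id` column) are assumptions based on the notes above.

# In[ ]:

# Sketch only: run the GROUP BY on the database and fetch just the per-category counts.
# Issue RESET QUERY CACHE between runs (older MySQL) so the cache does not skew timings.
import time
import pandas as pd
import sqlalchemy

engine = sqlalchemy.create_engine('mysql+pymysql://root@localhost/test')
start = time.time()
counts = pd.read_sql('SELECT category, COUNT(id) AS n FROM inr_import GROUP BY category', engine)
print('{:0.3f}s for {:d} categories'.format(time.time() - start, len(counts)))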