Using server-side cursors with PostgreSQL and MySQL

In [1]:
# see http://pynash.org/2013/03/06/timing-and-profiling.html for setup of profiling magics
In [2]:
import sys
sys.path.insert(0, '../src')
import petl; print petl.VERSION
from petl.fluent import etl
import psycopg2
import MySQLdb
0.18-SNAPSHOT
In [3]:
tbl_dummy_data = etl().dummytable(100000)
tbl_dummy_data.look()
Out[3]:
+-------+-----------+----------------------+
| 'foo' | 'bar'     | 'baz'                |
+=======+===========+======================+
|    31 | 'pears'   |  0.12509339133627373 |
+-------+-----------+----------------------+
|    90 | 'oranges' |  0.05715662664829624 |
+-------+-----------+----------------------+
|    12 | 'oranges' |   0.8525934855236975 |
+-------+-----------+----------------------+
|    68 | 'apples'  |    0.911131148945329 |
+-------+-----------+----------------------+
|    77 | 'apples'  |   0.8115001426786242 |
+-------+-----------+----------------------+
|    94 | 'apples'  |   0.6671472950408706 |
+-------+-----------+----------------------+
|    55 | 'apples'  | 0.003432210002982883 |
+-------+-----------+----------------------+
|    50 | 'apples'  |   0.7744929413714756 |
+-------+-----------+----------------------+
|    82 | 'oranges' |  0.46001316056152297 |
+-------+-----------+----------------------+
|    13 | 'bananas' |   0.9602502583307483 |
+-------+-----------+----------------------+
In [4]:
%memit print tbl_dummy_data.nrows()
100000
peak memory: 51.38 MiB, increment: 0.20 MiB

PostgreSQL

In [6]:
psql_connection = psycopg2.connect(host='localhost', dbname='petl', user='petl', password='petl')
In [ ]:
cursor = psql_connection.cursor()
cursor.execute('DROP TABLE IF EXISTS issue_219;')
cursor.execute('CREATE TABLE issue_219 (foo INTEGER, bar TEXT, baz FLOAT);')
In [7]:
%memit -r1 tbl_dummy_data.progress(10000).todb(psql_connection, 'issue_219')
10000 rows in 3.61s (2770 row/s); batch in 3.61s (2770 row/s)
20000 rows in 7.21s (2774 row/s); batch in 3.60s (2778 row/s)
30000 rows in 10.96s (2736 row/s); batch in 3.75s (2663 row/s)
40000 rows in 14.69s (2723 row/s); batch in 3.72s (2685 row/s)
50000 rows in 18.32s (2728 row/s); batch in 3.64s (2748 row/s)
60000 rows in 22.04s (2722 row/s); batch in 3.72s (2689 row/s)
70000 rows in 25.76s (2717 row/s); batch in 3.72s (2687 row/s)
80000 rows in 29.58s (2704 row/s); batch in 3.82s (2617 row/s)
90000 rows in 33.41s (2693 row/s); batch in 3.83s (2613 row/s)
100000 rows in 37.14s (2692 row/s); batch in 3.73s (2680 row/s)
peak memory: 53.32 MiB, increment: 0.01 MiB
100000 rows in 37.14s (2692 row/s)
In [8]:
# memory usage using default cursor
%memit print etl.fromdb(psql_connection, 'select * from issue_219 order by foo').look(2)
+-------+-----------+-------------------+
| 'foo' | 'bar'     | 'baz'             |
+=======+===========+===================+
|     0 | 'pears'   | 0.625346298507174 |
+-------+-----------+-------------------+
|     0 | 'bananas' | 0.191535466509102 |
+-------+-----------+-------------------+

peak memory: 60.89 MiB, increment: 7.32 MiB
In [9]:
# memory usage using server-side cursor
%memit print etl.fromdb(lambda: psql_connection.cursor(name='server-side'), 'select * from issue_219 order by foo').look(2)
+-------+-----------+-------------------+
| 'foo' | 'bar'     | 'baz'             |
+=======+===========+===================+
|     0 | 'pears'   | 0.625346298507174 |
+-------+-----------+-------------------+
|     0 | 'bananas' | 0.191535466509102 |
+-------+-----------+-------------------+

peak memory: 54.38 MiB, increment: 0.00 MiB

MySQL

In [10]:
mysql_connection = MySQLdb.connect(host='127.0.0.1', db='petl', user='petl', passwd='petl')
In [11]:
cursor = mysql_connection.cursor()
cursor.execute('SET SQL_MODE=ANSI_QUOTES')
cursor.execute('DROP TABLE IF EXISTS issue_219;')
cursor.execute('CREATE TABLE issue_219 (foo INTEGER, bar TEXT, baz FLOAT);')
Out[11]:
0L
In [12]:
%memit -r1 tbl_dummy_data.progress(10000).todb(mysql_connection, 'issue_219')
10000 rows in 2.96s (3373 row/s); batch in 2.96s (3373 row/s)
20000 rows in 6.22s (3214 row/s); batch in 3.26s (3069 row/s)
30000 rows in 9.45s (3174 row/s); batch in 3.23s (3097 row/s)
40000 rows in 12.50s (3199 row/s); batch in 3.05s (3278 row/s)
50000 rows in 15.60s (3205 row/s); batch in 3.10s (3229 row/s)
60000 rows in 18.65s (3217 row/s); batch in 3.05s (3280 row/s)
70000 rows in 21.90s (3196 row/s); batch in 3.25s (3072 row/s)
80000 rows in 24.88s (3215 row/s); batch in 2.98s (3353 row/s)
90000 rows in 28.08s (3205 row/s); batch in 3.19s (3130 row/s)
100000 rows in 31.24s (3200 row/s); batch in 3.17s (3158 row/s)
peak memory: 54.77 MiB, increment: 0.01 MiB
100000 rows in 31.24s (3200 row/s)
In [13]:
# memory usage with default cursor
%memit print etl.fromdb(mysql_connection, 'select * from issue_219 order by foo').look(2)
+-------+-----------+-----------+
| 'foo' | 'bar'     | 'baz'     |
+=======+===========+===========+
|    0L | 'bananas' |  0.191535 |
+-------+-----------+-----------+
|    0L | 'bananas' | 0.0228774 |
+-------+-----------+-----------+

peak memory: 79.88 MiB, increment: 25.11 MiB
In [15]:
# memory usage with server-side cursor
%memit print etl.fromdb(lambda: mysql_connection.cursor(MySQLdb.cursors.SSCursor), 'select * from issue_219 order by foo').look(2)
+-------+-----------+-----------+
| 'foo' | 'bar'     | 'baz'     |
+=======+===========+===========+
|    0L | 'bananas' |  0.191535 |
+-------+-----------+-----------+
|    0L | 'bananas' | 0.0228774 |
+-------+-----------+-----------+

peak memory: 80.00 MiB, increment: 0.09 MiB