Written by Ian Huston.
These notes are a short introduction to using the procedural language PL/Python and how this can help speed up calculations by leveraging the power of a massively parallel processing database like the Pivotal Greenplum Database.
Requirements: PostgreSQL or Greenplum DB, PL/Python + NumPy installed on all nodes.
For this notebook you will also need ipython-sql by Catherine Devlin
#Load the ipython-sql magic command
%load_ext sql
First we need to connect to the Greenplum database
# Normally use the following:
# %sql postgres://user:passwd@server/db
# Use a workaround to not display password!
with open("./.config") as conn_config:
conn_info = conn_config.readline().strip()
ip = get_ipython()
ip.magic("%sql " + conn_info)
We will do our work in a separate schema (container inside a database) to keep things tidy.
%%sql
-- Some quick tests of the capability of PL/Python on
-- PostgreSQL and Greenplum DB.
-- Create a schema to do some work in
DROP SCHEMA plp CASCADE;
CREATE SCHEMA plp;
First demonstrate what a User Defined Function looks like in SQL.
%%sql
-- Simple SQL User Defined Function to get started
CREATE FUNCTION plp.times2(INT)
RETURNS INT
AS $$
SELECT 2 * $1;
$$ LANGUAGE sql;
%%sql
--Try it out
SELECT plp.times2(10);
%%sql
-- Test using Python code
CREATE FUNCTION plp.pymax (a integer, b integer)
RETURNS integer
AS $$
if a > b:
return a
return b
$$ LANGUAGE plpythonu;
%%sql
--Test Python code
SELECT plp.pymax(10, 5);
%%sql
-- Create a composite return type
CREATE TYPE plp.named_value AS (
name text,
value integer
);
%%sql
--Simple function which returns a composite object
CREATE OR REPLACE FUNCTION plp.make_pair (name text, value integer)
RETURNS named_value
AS $$
return [ name, value ]
# or alternatively, as tuple: return ( name, value )
# or as dict: return { "name": name, "value": value }
# or as an object with attributes .name and .value
$$ LANGUAGE plpythonu;
%%sql
--Try out the function
-- (See https://en.wikipedia.org/wiki/Zozimus)
SELECT plp.make_pair('Zozimus', 1);
Note that UDFs and PL/X functions allow for multiple function signatures.
%%sql
--Using NumPy inside a PL/Python function
CREATE OR REPLACE FUNCTION plp.make_pair (name text)
RETURNS named_value
AS $$
import numpy as np
a = np.arange(100)
return [name, a[2]]
$$ LANGUAGE plpythonu;
%%sql
--Try it out
SELECT plp.make_pair('Horatio');
Note that the SQL calling syntax affects how the results are returned.
In SQL this is the difference between returning composite types and splitting out results into each subtype (name and value here).
%%sql
SELECT * FROM plp.make_pair('Horatio');
%%sql
--Returning a set of results using SETOF
CREATE OR REPLACE FUNCTION make_pair_sets (name text)
RETURNS SETOF named_value
AS $$
import numpy as np
return ((name, i) for i in np.arange(3))
$$ LANGUAGE plpythonu;
%%sql
--Try it out
SELECT * FROM make_pair_sets('Gerald');
Now we will try to use parallelisation with PL/Python
%%sql
--Set up some data to show parallelisation
DROP TABLE IF EXISTS plp.test_data;
CREATE TABLE plp.test_data AS
SELECT
'a'::text AS name
, generate_series(0,1000000)::float AS x
, generate_series(0,1000000)/100.0 AS y
DISTRIBUTED BY (name);
INSERT INTO plp.test_data
SELECT
'b'::text AS name
, generate_series(0,1000000)::float AS x
, sin(generate_series(0,1000000)/100.0) AS y;
INSERT INTO plp.test_data
SELECT
'c'::text AS name
, generate_series(0,1000000)::float AS x
, 100.0 + sin(generate_series(0,1000000)/100.0) AS y;
%%sql
-- Create a function to find the mean of some numbers
DROP FUNCTION IF EXISTS plp.np_mean(double precision[]);
CREATE OR REPLACE FUNCTION plp.np_mean(value_array double precision[])
RETURNS float
AS $$
import numpy as np
return np.mean(value_array)
$$ LANGUAGE plpythonu;
%%sql
-- Need to pass the numbers as an array using array_agg
SELECT plp.np_mean(array_agg(y)) FROM plp.test_data;
%%sql
-- Now try to do this for each type of data in parallel by grouping
SELECT
name,
plp.np_mean(array_agg(y))
FROM plp.test_data
GROUP BY name ORDER BY name;
%%sql
-- Now try do something even more interesting
DROP FUNCTION IF EXISTS plp.linregr(double precision[]);
CREATE OR REPLACE FUNCTION
plp.linregr(x double precision[], y double precision[])
RETURNS float[]
AS $$
from scipy import stats
return stats.linregress(x, y)
$$ LANGUAGE plpythonu;
%%sql
-- Do linear regression for all data
SELECT plp.linregr(array_agg(x), array_agg(y))
FROM plp.test_data;
%%sql
-- Now do it separately for each 'name'
SELECT name, plp.linregr(array_agg(x), array_agg(y))
FROM plp.test_data
GROUP BY name ORDER BY name;
In this example we have shown how to run models separately for different data using the GROUP BY clause. It is important to have distributed your data in the correct way to utilise the parallel architecture.
For further information see these notes on using PL/R in addition to PL/Python.