A Python wrapper for MADlib - an open source library for scalable in-database machine learning algorithms
from pymadlib.pymadlib import *
from pymadlib import example
from pymadlib.example import *
conn = DBConnect()
conn.conn
<connection object at 0x1022ad0a0; dsn: 'host='localhost' port ='5432' dbname='vatsandb' user='gpadmin' password=xxxxxxxxxxx', closed: 0>
#View Documentation
mdl = LinearRegression(conn)
print(mdl.train.__doc__)
Given train a linear regression model on the specified table for the given set of independent and dependent variables Inputs : ======== table_name : (String) input table name indep : (list of strings) the independent variables to be used to build the model on dep : (string) the class label Output : ======== The Model coefficients, r2, p_values and t_stats The function also returns the model object.
#Train Model and Score
lreg = LinearRegression(conn)
lreg.train('public.wine_training_set',['1','alcohol','proline','hue','color_intensity','flavanoids'],'quality')
cursor = lreg.predict('public.wine_test_set','quality')
#Print Prediction Results
rowset = conn.printTable(cursor,['id','quality','prediction'])
statement : select (madlib.linregr(quality,array[1,alcohol,proline,hue,color_intensity,flavanoids])).* from public.wine_training_set ---------------------------------------------------------------------------------------------------------------------------------------------------------------- Model Parameters ---------------------------------------------------------------------------------------------------------------------------------------------------------------- std_err | [176.45127024525, 34.9120997622056, 40.7792991586509, 7.79463508830726, 33.7169283855118, 42.0751829713439] condition_no | 6780.55233691 r2 | 0.667006811765 coef | [1416.22464229863, -361.192816094753, -88.8425299895989, 58.4638933049179, -23.0330507155227, 1.19714448390368] p_values | [7.08655356618237e-13, 0.0, 0.031282488716348, 1.13492548692307e-11, 0.495819848493887, 0.977347736470074] t_stats | [8.0261515846852, -10.3457774970546, -2.17861836330142, 7.50052986991265, -0.683130160973382, 0.0284525080905536] ---------------------------------------------------------------------------------------------------------------------------------------------------------------- -------------------------------------------------------------------------------- id | quality | prediction -------------------------------------------------------------------------------- 32 | 630.0 | 550.955896378 46 | 392.0 | 711.14959836 40 | 550.0 | 386.660785675 2 | 355.0 | 646.727206469 34 | 520.0 | 539.247125745 22 | 1035.0 | 968.434422731 16 | 840.0 | 700.782611847 48 | 680.0 | 498.484868526 10 | 880.0 | 593.444784403 4 | 428.0 | 701.318163157 30 | 1120.0 | 1098.35082555 24 | 1295.0 | 1006.53949356 18 | 407.0 | 506.590887785 42 | 550.0 | 739.476576567 36 | 1510.0 | 1058.28127122 50 | 1105.0 | 1050.27399993 12 | 515.0 | 760.553958893 44 | 1095.0 | 1012.94448829 6 | 920.0 | 1005.30491066 38 | 352.0 | 634.38564086 26 | 725.0 | 629.40588069 20 | 990.0 | 982.077078581 14 | 480.0 | 806.207940698 8 | 466.0 | 614.466643095 28 | 750.0 | 590.516592503 31 | 495.0 | 479.127750397 25 | 1260.0 | 1255.48760939 19 | 735.0 | 1008.71905088 13 | 315.0 | 527.815455437 37 | 1515.0 | 1161.07556272 45 | 1285.0 | 1011.64890575 7 | 520.0 | 547.024941921 39 | 675.0 | 772.617235661 1 | 1195.0 | 1019.59951043 33 | 472.0 | 644.038324594 27 | 1480.0 | 1158.94038153 21 | 885.0 | 1008.9016943 15 | 345.0 | 593.393530846 9 | 385.0 | 559.086925745 3 | 937.0 | 539.633692265 29 | 1680.0 | 1274.24084854 23 | 420.0 | 660.143455029 47 | 1065.0 | 987.11175108 41 | 495.0 | 653.476777911 35 | 714.0 | 604.747849941 17 | 1065.0 | 989.334903002 49 | 1045.0 | 1063.58939695 11 | 870.0 | 678.149069548 43 | 770.0 | 962.67329216 5 | 438.0 | 530.141533136 --------------------------------------------------------------------------------
#Generate Scatter Plot
cols = conn.fetchColumns(rowset,['quality','prediction'])
actual = cols['quality']
predicted = cols['prediction']
scatterPlot(actual,predicted, 'wine_test_set')
#Train Linear Regression Model on a mixture of Numeric and Categorical Variables
lreg.train('public.auto_mpg_train',['1','height','width','length','highway_mpg','engine_size','make','fuel_type','fuel_system'],'price')
cursor = lreg.predict('public.auto_mpg_test','price')
#Show prediction results
rowset = conn.printTable(cursor,['id','price','prediction'])
statement : select (madlib.linregr(price,array[1,height,width,length,highway_mpg,engine_size,make_val_0,make_val_1,make_val_2,make_val_3,make_val_4,make_val_5,make_val_6,make_val_7,make_val_8,make_val_9,make_val_10,make_val_11,make_val_12,make_val_13,fuel_type_val_0,fuel_system_val_0,fuel_system_val_1,fuel_system_val_2,fuel_system_val_3,fuel_system_val_4])).* from public.gp_pymdlib_auto_mpg_train ---------------------------------------------------------------------------------------------------------------------------------------------------------------- Model Parameters ---------------------------------------------------------------------------------------------------------------------------------------------------------------- std_err | [16613.1097084184, 124.479943337153, 242.264831832431, 37.1403629738067, 65.2804356109868, 16.0849662777181, 1166.34436370139, 1680.81795753747, 1315.51488977053, 1849.54130588577, 990.837833073393, 1294.15928209115, 1240.69628282232, 961.591952063424, 1225.18184104948, 1373.47489698111, 918.984647877449, 1018.3496214136, 832.622520380731, 1006.3833436486, 732.083068237177, 2063.18998614013, 1118.7258211047, 732.083068236985, 2126.45109180399, 1255.53183959977] condition_no | inf r2 | 0.95028645716 coef | [-14135.0920725423, -218.887424625356, 717.968847393939, -11.139007393805, -243.07773813379, 33.0294768017193, -732.787478989165, -2017.41165462347, -5769.49444754779, -5695.78114864106, -5768.32713194712, 6735.28199665331, -7112.12879006049, -4482.12569945376, -685.62036397255, -6080.09602732393, -1658.14370847461, -6197.85392583956, -4920.1509052387, -4553.60737000622, 584.594863615988, 371.30276372662, -261.038398801291, 584.594863615843, -1134.05798918608, -337.19065615657] p_values | [0.397366696196921, 0.0824540440447666, 0.00399174004260638, 0.765009172098666, 0.000361599170050519, 0.0432569143071544, 0.5315899680881, 0.23353858685713, 3.44696344571904e-05, 0.00283038879547226, 1.12632038917759e-07, 1.44037678795339e-06, 1.63958805532438e-07, 1.22041508766602e-05, 0.577291726445808, 2.95877567791925e-05, 0.0748979930970457, 3.65996161155735e-08, 7.778490374033e-08, 2.04950519373526e-05, 0.426894516148574, 0.857629216185837, 0.81608959533545, 0.426894516148567, 0.595279238124451, 0.788948040814391] t_stats | [-0.85083962729624, -1.75841520133489, 2.96357024650834, -0.29991649251411, -3.72359246470592, 2.05343774002646, -0.628277121058541, -1.2002558906374, -4.3857310110372, -3.07956417654229, -5.82166620955001, 5.20436865064261, -5.73236890327578, -4.66115142689769, -0.559607024035919, -4.42679807303939, -1.80432144574381, -6.08617492019698, -5.90922150771142, -4.52472449861633, 0.798536244013492, 0.179965377023404, -0.233335455280299, 0.798536244013503, -0.533310168081035, -0.268564002537806] ---------------------------------------------------------------------------------------------------------------------------------------------------------------- -------------------------------------------------------------------------------- id | price | prediction -------------------------------------------------------------------------------- 31 | 20970.0 | 16807.7980042 25 | 6479.0 | 3600.97329712 19 | 7999.0 | 7531.80499268 13 | 22018.0 | 20119.8197327 37 | 21105.0 | 16807.7980042 57 | 6229.0 | 6548.61895752 51 | 11549.0 | 11891.4570618 45 | 9279.0 | 8833.89758301 7 | 6377.0 | 6504.84161377 39 | 15985.0 | 17221.686554 1 | 7995.0 | 7425.13757324 33 | 9258.0 | 8483.48651123 27 | 10245.0 | 9702.81317139 21 | 9988.0 | 10849.5627747 53 | 9995.0 | 9718.19937134 15 | 8845.0 | 10096.8102417 9 | 7975.0 | 9232.04360962 3 | 7129.0 | 8080.17727661 29 | 5572.0 | 5465.00665283 23 | 10898.0 | 10630.6750793 47 | 10698.0 | 11497.6240234 41 | 16630.0 | 17590.2693176 35 | 6575.0 | 8618.56866455 55 | 16558.0 | 15602.0802307 17 | 7689.0 | 7632.10314941 49 | 8845.0 | 9168.08169556 11 | 7775.0 | 7425.13757324 43 | 32250.0 | 25620.8760071 5 | 7295.0 | 6681.63418579 32 | 15690.0 | 14232.9558411 52 | 7895.0 | 9431.19009399 46 | 14399.0 | 15085.3013 40 | 22625.0 | 19324.6885071 2 | 8948.0 | 10144.5193176 34 | 16515.0 | 16937.1338196 22 | 31600.0 | 30635.0737305 54 | 8238.0 | 8213.34381104 16 | 7395.0 | 5998.0725708 48 | 9980.0 | 10378.5294495 10 | 13499.0 | 14574.9560852 4 | 6989.0 | 8483.05767822 30 | 17199.0 | 16916.9359436 24 | 9960.0 | 8666.22402954 18 | 13415.0 | 16937.1338196 42 | 8449.0 | 11891.4570618 36 | 18280.0 | 11421.3580627 56 | 18150.0 | 18134.1106262 50 | 8013.0 | 8928.45709229 12 | 22470.0 | 20477.0307617 44 | 10595.0 | 10096.8102417 6 | 6855.0 | 7490.21743774 38 | 5389.0 | 4929.81427002 26 | 7957.0 | 8664.13586426 20 | 8921.0 | 7607.48773193 14 | 9538.0 | 9352.58071899 8 | 28248.0 | 29234.9177246 28 | 16430.0 | 14715.0697327 --------------------------------------------------------------------------------
#Display Scatter Plot of Actual Vs Predicted Values
cols = conn.fetchColumns(rowset,['price','prediction'])
actual = cols['price']
predicted = cols['prediction']
scatterPlot(actual,predicted, 'auto_mpg_test')
#1) Logistic Regression with Numeric Variables Alone
log_reg = LogisticRegression(conn)
#Train Model
log_reg.train('public.wine_bool_training_set','indep','quality_label')
#Scoring
cursor = log_reg.predict('wine_bool_test_set','',None)
#Display ROC Curve
cols = conn.fetchColumns(cursor,['quality_label','prediction'])
actual = cols['quality_label']
predicted = cols['prediction']
ROCPlot('ROC curve Logistic Reg. on Continuous Features ',['Logistic Regression'],actual,predicted)
statement : select * from madlib.logregr('public.wine_bool_training_set','quality_label','indep',100, 'irls', 0.001) ---------------------------------------------------------------------------------------------------------------------------------------------------------------- Model Parameters ---------------------------------------------------------------------------------------------------------------------------------------------------------------- std_err | [2.62370220266303, 0.645203905535534, 0.583811938681368, 0.126975428265817, 0.48793731244426, 0.567356075421763] odds_ratios | [39.3036672395389, 0.0948315596232434, 0.693224535777532, 1.76962044606699, 0.864158253007184, 1.07564201713425] condition_no | 6198.90723196 log_likelihood | -59.6640513625 coef | [3.67131782850492, -2.35565301770436, -0.366401326819681, 0.570765086340494, -0.145999363747654, 0.0729177085527688] p_values | [0.161726314395655, 0.000261199864726505, 0.530264937852516, 6.95428226814735e-06, 0.764774127325118, 0.897735929525397] num_iterations | 6 z_stats | [1.39928907510104, -3.65102101443282, -0.62760163426472, 4.49508297893372, -0.299217460981389, 0.128521948934032] ---------------------------------------------------------------------------------------------------------------------------------------------------------------- statement: select *, (1.0/(1.0 + exp(-1.0*madlib.array_dot(indep, array[3.67131782850492, -2.35565301770436, -0.366401326819681, 0.570765086340494, -0.145999363747654, 0.0729177085527688]::real[])))) as prediction from wine_bool_test_set
#2) Logistic Regression with Mixture of Numeric and Categorical Variables
#Train Model
log_reg.train('public.auto_mpg_bool_train',['1','height','width','length','highway_mpg','engine_size','make','fuel_type','fuel_system'],'is_expensive')
#Scoring
cursor = log_reg.predict('auto_mpg_bool_test','is_expensive',None)
#Display ROC Curve
cols = conn.fetchColumns(cursor,['is_expensive','prediction'])
actual = cols['is_expensive']
predicted = cols['prediction']
ROCPlot('ROC curve Logistic Reg. including categorical data',['Logistic Regression'],actual,predicted)
statement : select * from madlib.logregr('public.gp_pymdlib_arr_auto_mpg_bool_train','is_expensive','indep',100, 'irls', 0.001) ---------------------------------------------------------------------------------------------------------------------------------------------------------------- Model Parameters ---------------------------------------------------------------------------------------------------------------------------------------------------------------- std_err | [8260.75052933305, 1.03652077514348, 93.3156998618529, 32.3727321255478, 35.1587803852847, 11.7764652602903, 1048.72207606614, 1817.9721617305, 1142.9462294548, 2140.62154979112, 627.658988616015, 1212.25303818268, 1148.5925747908, 905.51831911587, 10179.997776188, 1175.40540655678, 971.023939939771, 589.594381627054, 576.867235921955, 884.128262263986, 5102.22372651156, 2366.61383087985, 1219.955835445, 5102.22372651264, 2482.2561061022, 1255.91741854684] odds_ratios | [0.0, 1.42137730071846, 819.356599462108, 204.275499512008, 0.00194264365415406, 0.368185820031342, 5092576381042.13, 1.09433309076233e+106, 1.47610865163205e+58, 3.04496896416429e+58, 1.82050722710836e+37, 1.04409484236975e+24, 9.65451082068503e+45, 7.72475633489319e+49, 3.47861111653924, 1.5045391289412e+66, 0.225937808714701, 7.9639065247808e+30, 5.26718987130233e+29, 2.8151350501707e+29, 3.29595046033607e+21, 2730575249.71417, 36523614262.71, 3.29595046071977e+21, 42.9493516072627, 3.48935985369781e+23] condition_no | inf log_likelihood | -4.62431873772 coef | [-1199.53601757484, 0.351626331618349, 6.70851939746899, 5.3194695705597, -6.24370552522441, -0.999167522518064, 29.2588049836668, 244.164164985559, 133.939344729341, 134.663426102372, 85.7947635992532, 55.3051925623731, 105.883754433424, 114.871099836586, 1.24663310968026, 152.379102762294, -1.48749550025786, 71.1524724387345, 68.4364646859103, 67.8099779324621, 49.5469815345714, 21.7277781381272, 24.3212248544442, 49.5469815346878, 3.76002155186143, 54.2091754352441] p_values | [0.884545793953474, 0.734431101157513, 0.942688990038906, 0.869479670122989, 0.859048156976056, 0.932385068122934, 0.977742319839157, 0.893160784707686, 0.906711241024046, 0.949839315887018, 0.89127576309419, 0.963611675683649, 0.926550538206978, 0.899053788285458, 0.999902291795089, 0.896851560553204, 0.998777734326712, 0.903944202018277, 0.905564782173481, 0.938864597823002, 0.992251976314663, 0.992674759629806, 0.984094305819195, 0.992251976314647, 0.998791397072257, 0.965571591755914] num_iterations | 15 z_stats | [-0.145209084007005, 0.339237128720044, 0.0718905758345108, 0.164319451009873, -0.177585953119058, -0.0848444334045813, 0.0278994841926276, 0.134305777682064, 0.117187791759225, 0.0629085632233835, 0.136690089929932, 0.0456218221942199, 0.0921856511676549, 0.126856737640321, 0.000122459074853263, 0.129639613628009, -0.0015318834470241, 0.120680377316997, 0.118634688233827, 0.0766969916319847, 0.0097108602425882, 0.00918095629063795, 0.0199361519063291, 0.00971086024260898, 0.00151475971501009, 0.0431630094739564] ---------------------------------------------------------------------------------------------------------------------------------------------------------------- statement: select *, (1.0/(1.0 + exp(-1.0*madlib.array_dot(indep, array[-1199.53601757484, 0.351626331618349, 6.70851939746899, 5.3194695705597, -6.24370552522441, -0.999167522518064, 29.2588049836668, 244.164164985559, 133.939344729341, 134.663426102372, 85.7947635992532, 55.3051925623731, 105.883754433424, 114.871099836586, 1.24663310968026, 152.379102762294, -1.48749550025786, 71.1524724387345, 68.4364646859103, 67.8099779324621, 49.5469815345714, 21.7277781381272, 24.3212248544442, 49.5469815346878, 3.76002155186143, 54.2091754352441]::real[])))) as prediction from public.gp_pymdlib_arr_auto_mpg_bool_test
#Demonstrate K-Means
example.kmeansDemo(conn)
KMeans with random cluster initialization statement : select * from madlib.kmeans_random( 'public.wine_bool_training_set', 'indep', 3, 'madlib.squared_dist_norm2', 'madlib.avg', 20, 0.001 ); ---------------------------------------------------------------------------------------------------------------------------------------------------------------- Model Parameters ---------------------------------------------------------------------------------------------------------------------------------------------------------------- num_iterations | 10 objective_fn | 285.251101444 centroids | [[1.0, 2.0, 2.77145454233343, 2.96581817106767, 1.5658181802793, 2.2052727244117], [1.0, 3.0, 1.67380951699756, 9.06999999000913, 1.32095237289156, 1.79333332039061], [1.0, 1.57692307692308, 2.75711538699957, 5.62903844393217, 1.65884615251651, 2.56576922994394]] frac_reassigned | 0.0 ---------------------------------------------------------------------------------------------------------------------------------------------------------------- KMeans Plus Plus statement : select * from madlib.kmeans_random( 'public.wine_bool_training_set', 'indep', 3, 'madlib.squared_dist_norm2', 'madlib.avg', 20, 0.001 ); ---------------------------------------------------------------------------------------------------------------------------------------------------------------- Model Parameters ---------------------------------------------------------------------------------------------------------------------------------------------------------------- num_iterations | 5 objective_fn | 292.842264034 centroids | [[1.0, 2.15714285714286, 2.5991428562573, 3.43871427263532, 1.44585713957037, 2.09742856706892], [1.0, 1.10810810810811, 3.07729729768392, 5.81405403807357, 1.92351351557551, 2.91594594878119], [1.0, 3.0, 1.67380951699756, 9.06999999000913, 1.32095237289156, 1.79333332039061]] frac_reassigned | 0.0 ---------------------------------------------------------------------------------------------------------------------------------------------------------------- Couldn't import dot_parser, loading of dot files will not be possible.
networkVizDemo()