# Bayesian Logistic Regression¶

Install PYMC3: https://pymc-devs.github.io/pymc3/#installation

In [32]:
import pymc3 as pm
from sklearn import datasets
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set(style='ticks', palette='Set2')
import pandas as pd
import numpy as np
import math
from theano import tensor
from __future__ import division

X = data.data[:100, :2]
y = data.target[:100]
X_full = data.data[:100, :]

setosa = plt.scatter(X[:50,0], X[:50,1], c='b')
versicolor = plt.scatter(X[50:,0], X[50:,1], c='r')
plt.xlabel("Sepal Length")
plt.ylabel("Sepal Width")
plt.legend((setosa, versicolor), ("Setosa", "Versicolor"))
sns.despine()

In [2]:
basic_model = pm.Model()
X1 = X[:, 0]
X2 = X[:, 1]

with basic_model:

# Priors for unknown model parameters
alpha = pm.Normal('alpha', mu=0, sd=10)
beta = pm.Normal('beta', mu=0, sd=10, shape=2)

# Expected value of outcome
mu = alpha + beta[0]*X1 + beta[1]*X2
logit = 1 / (1 + tensor.exp(-mu))

# Likelihood (sampling distribution) of observations
Y_obs = pm.Bernoulli('Y_obs', p=logit, observed=y)

start = pm.find_MAP()
step = pm.NUTS(scaling=start) # Instantiate MCMC sampling algorithm
trace = pm.sample(2000, step, progressbar=False) # draw 2000 posterior samples using NUTS sampling

/Users/tylerfolkman/anaconda/lib/python2.7/site-packages/theano/scan_module/scan_perform_ext.py:133: RuntimeWarning: numpy.ndarray size changed, may indicate binary incompatibility
from scan_perform.scan_perform import *

In [3]:
pm.traceplot(trace)

Out[3]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x10cdc2450>,
<matplotlib.axes._subplots.AxesSubplot object at 0x1118839d0>],
[<matplotlib.axes._subplots.AxesSubplot object at 0x11189ce10>,
<matplotlib.axes._subplots.AxesSubplot object at 0x1119791d0>]], dtype=object)
In [14]:
np.mean(trace.get_values('beta'), axis=0)

Out[14]:
array([ 10.12441596, -12.76510469])
In [33]:
def predict(trace, x1, x2, threshold):
alpha = trace.get_values('alpha').mean()
betas = np.mean(trace.get_values('beta'), axis=0)
linear = alpha + (x1 * betas[0]) + (x2 * betas[1])
probability = 1 / (1 + np.exp(-linear))
return [np.where(probability >= threshold, 1, 0), linear, probability]
def accuracy(predictions, actual):
return np.sum(predictions == actual) / len(predictions)

In [34]:
predictions, logit_x, logit_y = predict(trace, X1, X2, 0.5)
accuracy(predictions, y)

Out[34]:
0.98999999999999999
In [36]:
plt.scatter(logit_x, logit_y)

Out[36]:
<matplotlib.collections.PathCollection at 0x112a96f50>