#!/usr/bin/env python # coding: utf-8 # # ロジステック回帰 - pymc3 # In[8]: from __future__ import division import os import sys import glob import matplotlib.pyplot as plt import numpy as np import pandas as pd get_ipython().run_line_magic('matplotlib', 'inline') get_ipython().run_line_magic('precision', '4') #plt.style.use('ggplot') import seaborn as sns sns.set_style('white') sns.set_context('paper') np.random.seed(1234) import pymc3 as pm import scipy.stats as stats import logging _logger = logging.getLogger("theano.gof.compilelock") _logger.setLevel(logging.ERROR) # ## ロジステック回帰による女性・男性決定問題 # # - 身長,体重データを利用して女性・男性を決定する問題を扱う. # - pymc3のexample/dataフォルダにあるデータを利用する. # - データはpandasのデータフレームとして渡す(データの単位は不明….foot/poundでない) # In[9]: # observed data df = pd.read_csv('data/HtWt.csv') df.head() # In[10]: get_ipython().run_line_magic('pinfo', 'pm.glm.glm') # In[24]: niter = 1000 with pm.Model() as model: pm.glm.glm('male ~ height + weight', df, family=pm.glm.families.Binomial()) trace = pm.sample(niter, step=pm.Slice(), random_seed=123, progressbar=True) pm.traceplot(trace) # In[25]: # note that height and weigth in trace refer to the coefficients df_trace = pm.trace_to_dataframe(trace) pd.scatter_matrix(df_trace[-1000:], diagonal='kde'); # In[26]: plt.figure(figsize=(12, 4)) plt.subplot(121) plt.plot(df_trace.ix[-1000:, 'height'], linewidth=0.7) plt.subplot(122) plt.plot(df_trace.ix[-1000:, 'weight'], linewidth=0.7); # In[14]: pm.summary(trace); # In[27]: #import seaborn as sn sns.kdeplot(trace['weight'], trace['height']) plt.xlabel('Weight', fontsize=20) plt.ylabel('Height', fontsize=20) # In[16]: intercept, height, p, p_logodds, weight = df_trace[-niter//2:].mean(0) def predict(w, h, height=height, weight=weight): """Predict gender given weight (w) and height (h) values.""" v = intercept + height*h + weight*w return np.exp(v)/(1+np.exp(v)) # calculate predictions on grid xs = np.linspace(df.weight.min(), df.weight.max(), 100) ys = np.linspace(df.height.min(), df.height.max(), 100) X, Y = np.meshgrid(xs, ys) Z = predict(X, Y) plt.figure(figsize=(6,6)) # plot 0.5 contour line - classify as male if above this line plt.contour(X, Y, Z, levels=[0.5]) # classify all subjects colors = ['lime' if i else 'yellow' for i in df.male] ps = predict(df.weight, df.height) errs = ((ps < 0.5) & df.male) |((ps >= 0.5) & (1-df.male)) plt.scatter(df.weight[errs], df.height[errs], facecolors='red', s=150) plt.scatter(df.weight, df.height, facecolors=colors, edgecolors='k', s=50, alpha=1); plt.xlabel('Weight', fontsize=16) plt.ylabel('Height', fontsize=16) plt.title('Gender classification by weight and height', fontsize=16) plt.tight_layout();