Notebook

In [1]:

import numpy as np
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
import matplotlib
plt.style.use('bmh')
matplotlib.rc('lines',linewidth=3)
matplotlib.rc('font',size=12)

In [2]:

# 3 cases: no z (None), z=y, and randomly distributed z
case = 'randomz'
case = 'z=y'
#case = None

In [3]:

# generate synthetic training data
N_training = 10000
eps = .1 # noise term

x = np.random.uniform(low=0.0, high=1.0, size=N_training)
y = np.random.uniform(low=0.0, high=1.0, size=N_training)
if case == 'randomz':
    z = np.random.uniform(low=0.0, high=1.0, size=N_training)
if case == 'z=y':
    z = y
    
noise = np.random.normal(0, eps, size=N_training)

f = 2 + x**2 + y**2 + noise # truth

# plot training data
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(x, y, f)
ax.set_xlim([0,1])
ax.set_ylim([0,1])
ax.set_xlabel('x')
ax.set_ylabel('y')
ax.set_zlabel('f(x,y)')
plt.savefig('training_data.png', format='png')
plt.show()

In [4]:

# train forest
forest = RandomForestRegressor(n_estimators=1000, max_features=1, oob_score=True)

if case == 'randomz' or case == 'z=y':
    forest.fit(
        np.concatenate((x.reshape(N_training,1), y.reshape(N_training,1), z.reshape(N_training,1)),axis=1), 
                               f.reshape(N_training,))
elif case == None:
    forest.fit(
        np.concatenate((x.reshape(N_training,1), y.reshape(N_training,1)),axis=1),
                    f.reshape(N_training,))

# get feature importance and out of bag prediction
feature_importance = forest.feature_importances_
out_of_bag_prediction = forest.oob_prediction_

In [5]:

# visualize surface predicted by forest
n = 50
x_p = np.linspace(0, 1, n)
y_p = np.linspace(0, 1, n)

FT = np.zeros((n, n))
for i in range(n):
    for j in range(n):
        if case == 'randomz':
            # pass in a random number for z
            FT[i,j] = forest.predict(np.array([x_p[i], y_p[j], np.random.uniform(low=0.0, high=1.0)]))
        elif case == 'z=y':
            FT[i,j] = forest.predict(np.array([x_p[i], y_p[j], y_p[j]]))
        elif case == None:
            FT[i,j] = forest.predict(np.array([x_p[i], y_p[j]]))

In [6]:

# plot contour
fig = plt.figure()
ax = fig.gca(projection='3d')
X, Y = np.meshgrid(x_p, y_p)
ax.plot_surface(X, Y, FT.T, rstride=8, cstride=8, alpha=0.3)
ax.set_xlabel('$x$')
ax.set_ylabel('$y$')
ax.set_zlabel('$\hat{f}(x,y)$')
ax.set_xlim([0,1])
ax.set_ylim([0,1])
plt.savefig('leanrnedsuface.png', format='png')
plt.show()

In [7]:

# how does out of bag prediction compare to real f?
fig = plt.figure()
H2d, xed, yed = np.histogram2d(out_of_bag_prediction, f, bins=20) # numpy 2d histogram function
Hpl = np.rot90(H2d)  # rotate, ...
Hpl = np.flipud(Hpl) # ... flip, ... 
Hpl = np.ma.masked_where(Hpl==0, Hpl) # ... and mask zero values, ...
cmap=plt.get_cmap('Blues')
plt.plot(np.linspace(np.min(f), np.max(f)), np.linspace(np.min(f), np.max(f)))
im = plt.pcolormesh(xed, yed, Hpl, cmap=cmap) # ... then make plot
plt.xlabel('Out of bag prediction by forest')
plt.ylabel('True value')
plt.title("Root mean square error: %.2f" % np.sqrt(np.mean((out_of_bag_prediction - f)**2)))
plt.savefig('oob.png', format='png')
plt.show()

In [8]:

# plot feature importance
fig, ax = plt.subplots()
if case == None:
    ind = np.arange(2)
    feature_labels = ['x', 'y']
else:
    feature_labels = ['x', 'y', 'z']
    ind = np.arange(3)
rects1 = ax.bar(ind+0.25, feature_importance, 0.5, color='g',alpha = 0.6)
ax.set_xticks(ind+0.5/2+0.25)
ax.set_xticklabels(feature_labels)
plt.tick_params(axis='both', which='major', labelsize=18)
plt.ylabel('Feature importance')
plt.title('Feature importance')
plt.tight_layout()
plt.savefig('featureimportance.png',format='png')
plt.show()

In [8]: