sys.path.append('pysrc')
import decision_trees as dt
import networkx as nx
import numpy as np

# Toy problem: every (x, y) point on the 4x4 integer grid, enumerated with x
# as the slow axis and y as the fast axis.
grid_points = [(gx, gy) for gx in range(4) for gy in range(4)]
inputs = np.zeros((16, 2))
inputs[:] = grid_points

# Label is 1 when exactly one of the coordinates is in the upper half
# (x > 1 with y < 2, or x < 2 with y > 1) -- an XOR-style pattern.
outputs = [
    1 if ((px > 1 and py < 2) or (px < 2 and py > 1)) else 0
    for px, py in inputs
]

clazz = [0, 1]
meta = ['x', 'y']
tree = dt.build_tree(inputs, outputs, clazz, meta)
dt.draw_tree(tree)

# Combined table for inspection; columns are: row index, x, y, label.
data = np.zeros((16, 4))
data[:, 0] = np.arange(16)
data[:, 1:3] = inputs
data[:, 3] = outputs
#print data

sys.path.append('pysrc')
import decision_trees as dt
import numpy as np
import networkx as nx
from matplotlib import pyplot as plt

# Train a decision tree on the first 75% of data/SAheart.data, then report
# and plot its misclassifications on the training rows and on the remaining
# hold-out rows.


def _parse_rows(rows, col_cnt):
    """Split comma-separated text rows into (feature matrix, label list).

    The first ``col_cnt - 1`` fields of each row are parsed as floats; the
    field at index ``col_cnt - 1`` is the class label.
    """
    features = np.zeros((len(rows), col_cnt - 1))
    labels = []
    for idx, text in enumerate(rows):
        fields = text.split(',')
        features[idx] = [float(v) for v in fields[:col_cnt - 1]]
        # Only the first character of the label field is read, so a trailing
        # newline on the last field does not break int().
        labels.append(int(fields[col_cnt - 1][0]))
    return features, labels


def _miss_flags(fitted_tree, features, labels):
    """Return one flag per row: 0 if dt.decide matches the label, else 1."""
    return [
        0 if dt.decide(fitted_tree, feat) == label else 1
        for feat, label in zip(features, labels)
    ]


data_path = 'data/SAheart.data'
# Use a context manager so the handle is closed; blank lines (e.g. a trailing
# empty line at end of file) are skipped so they never reach the parser.
with open(data_path, 'r') as data_file:
    all_lines = [ln for ln in data_file if ln.strip()]

train_cnt = int(0.75 * len(all_lines))
lines = all_lines[:train_cnt]
col_cnt = len(lines[0].split(','))
row_cnt = len(lines)
inputs, outputs = _parse_rows(lines, col_cnt)
clazz = [0, 1]

tree = dt.build_tree(inputs, outputs, clazz, meta=['sbp','tob','ldl','adip','famhist','typea','obes','alc','age'], max_rm=5)
#dt.draw_tree(tree)

# Compare the tree prediction to the training values; since we haven't pruned,
# this should be very accurate.
diff = _miss_flags(tree, inputs, outputs)
misses = sum(diff)
# Single-argument print(...) produces the same output on Python 2 and 3.
print("In the training data, there were {0} miss classifications for {1} inputs, a rate of {2}%".format(misses, train_cnt, 100*misses/float(train_cnt)))
x = range(train_cnt)
f, axarr = plt.subplots(2,1)
f.subplots_adjust(right=1.5)
f.subplots_adjust(top=1.5)

# Plot training comparison (1 marks a misclassified row).
ax1 = axarr[0]
ax1.scatter(x,diff)

# Compare the tree prediction to actual values not used in the training set.
# The hold-out set is every row after the training slice; the previous
# train_cnt+1 .. len-1 bounds dropped the first and last hold-out rows.
test_lines = all_lines[train_cnt:]
row_cnt = len(test_lines)
test_in, actual_out = _parse_rows(test_lines, col_cnt)

diff = _miss_flags(tree, test_in, actual_out)
misses = sum(diff)
print("In the hold out data, there were {0} miss classifications for {1} inputs, a rate of {2}%".format(misses, len(test_in), 100*misses/float(len(test_in))))

x = range(len(diff))
ax2 = axarr[1]
ax2.scatter(x,diff)