import math
from collections import Counter

import pandas as pd

# DataFrame.from_csv was removed from pandas; read_csv is the supported loader.
df_shroom = pd.read_csv('../datasets/mushroom_data.csv')
df_shroom  # bare expression: displays the frame when run in a notebook


def entropy(probs):
    '''Takes a list of probabilities and calculates their entropy.'''
    return sum([-prob * math.log(prob, 2) for prob in probs])


def entropy_of_list(a_list):
    '''
    Takes a list of items with discrete values (e.g., poisonous, edible)
    and returns the entropy for those items.
    '''
    # Tally up each distinct value:
    cnt = Counter(x for x in a_list)

    # Convert counts to proportions:
    num_instances = len(a_list) * 1.0
    probs = [x / num_instances for x in cnt.values()]

    # Calculate entropy:
    return entropy(probs)


# The initial entropy of the poisonous/edible class attribute for our dataset.
total_entropy = entropy_of_list(df_shroom['class'])
print(total_entropy)


def information_gain(df, split_attribute_name, target_attribute_name, trace=0):
    '''
    Takes a DataFrame of attributes, and quantifies the entropy of a target
    attribute after performing a split along the values of another attribute.
    '''
    # Split data by possible values of the attribute:
    df_split = df.groupby(split_attribute_name)

    # Calculate the entropy of the target attribute, as well as the
    # proportion of observations, in each data split:
    nobs = len(df.index) * 1.0
    df_agg_ent = df_split.agg({target_attribute_name:
                               [entropy_of_list, lambda x: len(x) / nobs]
                               })[target_attribute_name]
    df_agg_ent.columns = ['Entropy', 'PropObservations']
    if trace:  # helps understand what the function is doing
        print(df_agg_ent)

    # Calculate information gain:
    new_entropy = sum(df_agg_ent['Entropy'] * df_agg_ent['PropObservations'])
    old_entropy = entropy_of_list(df[target_attribute_name])
    return old_entropy - new_entropy


print('\nExample: Info-gain for best attribute is '
      + str(information_gain(df_shroom, 'odor', 'class')))
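# A quick sanity check on the entropy helpers (illustrative values only, not
# drawn from the dataset): a fair 50/50 split carries exactly one bit of
# entropy, while a homogeneous list carries none.
print(entropy([0.5, 0.5]))               # -> 1.0
print(entropy_of_list(['e', 'e', 'e']))  # -> 0.0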
def id3(df, target_attribute_name, attribute_names, default_class=None):
    # Tally the target attribute:
    cnt = Counter(x for x in df[target_attribute_name])

    # First check: is this split of the dataset homogeneous?
    # (e.g., all mushrooms in this set are poisonous)
    # If yes, return that homogeneous label (e.g., 'poisonous').
    if len(cnt) == 1:
        return next(iter(cnt))

    # Second check: is this split of the dataset empty?
    # If yes, return a default value.
    elif df.empty or (not attribute_names):
        return default_class

    # Otherwise: this dataset is ready to be divvied up!
    else:
        # Get the default value for the next recursive call of this function:
        # the most common value of the target attribute in this dataset.
        default_class = cnt.most_common(1)[0][0]

        # Choose the best attribute to split on:
        gainz = [information_gain(df, attr, target_attribute_name)
                 for attr in attribute_names]
        index_of_max = gainz.index(max(gainz))
        best_attr = attribute_names[index_of_max]

        # Create an empty tree, to be populated in a moment:
        tree = {best_attr: {}}
        remaining_attribute_names = [i for i in attribute_names if i != best_attr]

        # Split the dataset. On each split, recursively call this algorithm,
        # and populate the empty tree with the subtrees that the recursive
        # calls return.
        for attr_val, data_subset in df.groupby(best_attr):
            subtree = id3(data_subset,
                          target_attribute_name,
                          remaining_attribute_names,
                          default_class)
            tree[best_attr][attr_val] = subtree
        return tree


# Get predictor names (all but 'class'):
attribute_names = list(df_shroom.columns)
attribute_names.remove('class')

# Run the algorithm:
from pprint import pprint
tree = id3(df_shroom, 'class', attribute_names)
pprint(tree)


def classify(instance, tree, default=None):
    attribute = next(iter(tree))
    if instance[attribute] in tree[attribute]:
        result = tree[attribute][instance[attribute]]
        if isinstance(result, dict):  # this is a subtree: delve deeper
            return classify(instance, result, default)
        else:
            return result  # this is a label
    else:
        return default


# classify() allows for a default argument: when the tree doesn't have an
# answer for a particular combination of attribute values, we use 'poisonous'
# as the default guess (better safe than sorry!).
df_shroom['predicted'] = df_shroom.apply(classify, axis=1,
                                         args=(tree, 'poisonous'))

print('Accuracy is ' + str(sum(df_shroom['class'] == df_shroom['predicted'])
                           / (1.0 * len(df_shroom.index))))

df_shroom[['class', 'predicted']]  # notebook display of actual vs. predicted

training_data = df_shroom.iloc[:-1000]     # all but the last thousand instances
test_data = df_shroom.iloc[-1000:].copy()  # just the last thousand (copied so we
                                           # can add a column without a warning)

train_tree = id3(training_data, 'class', attribute_names)

test_data['predicted2'] = test_data.apply(        # <---- test_data source
    classify, axis=1, args=(train_tree, 'poisonous'))  # <---- train_tree model

print('Accuracy is ' + str(sum(test_data['class'] == test_data['predicted2'])
                           / (1.0 * len(test_data.index))))
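# Illustrative follow-up (a sketch, not part of the original pipeline):
# classify a single held-out row with the tree trained on training_data.
# classify() walks the nested dict, looking up instance[attribute] at each
# level, and falls back to the 'poisonous' default on unseen attribute values.
example_row = test_data.iloc[0]
print('Predicted: ' + str(classify(example_row, train_tree, 'poisonous')))
print('Actual:    ' + str(example_row['class']))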