import pandas as pd
import pylab as P
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter
from nltk import FreqDist
%matplotlib inline 

# read in file as pandas dataframe
def read_in_file(filename):
    data = pd.read_csv(filename)
    return data

# for sanity checks of data within columns
#  prints out the unique categorical values found in specified columns
def get_unique_values(df, verbose=False):
    column_names = list(df)
    agg_levels = pd.unique(df[column_names[0]])
    names = pd.unique(df[column_names[2]])
    discip_type = pd.unique(df[column_names[3]])
    ethnic_groups = pd.unique(df[column_names[4]])
    if verbose:
        print("Aggregate Levels are: %s" % agg_levels)
        print("Names of schools are: %s" % names)
        print("Discipline types are: %s" % discip_type)
        print("Ethnic groups are: %s" % ethnic_groups)
    return

#  subset the data by AGGREGATION LEVEL
#  D=Local educational agency totals (includes districts and direct funded charter schools)
#  O=County totals
#  S=SchoolTotals
#  T=State totals
def subset_by_agg_level(df):
    column_names = list(df)
    county_data = df[df.AggegateLevel == 'O']
    district_data = df[df.AggegateLevel == 'D']
    school_data = df[df.AggegateLevel == 'S']
    state_data = df[df.AggegateLevel == 'T']
    # label columns in subset:
    county_data.columns = column_names
    district_data.columns = column_names
    school_data.columns = column_names
    state_data.columns = column_names
    return county_data, district_data, school_data, state_data

#  export a specified dataframe to csv
def export_df_to_csv(df, output_filename):
    df.to_csv(output_filename, index=False, header=True)
    return

# grab a subset of a dataframe by column indices
def subset_cols_dataframe(df, col_indexes):
    subset = df.iloc[:, col_indexes]
    return subset

# append together an array of dataframes(any number) in order
# example: array_df = [dataframe1, dataframe2, dataframe3]
def append_dataframes(array_df):
    temp = array_df[0]
    for i in range(1, len(array_df)):
        temp = temp.append(array_df[i])
    return temp

#  this chunk of code just takes the input csv and subsets it
ca14 = read_in_file('CA_discip14.csv')  # 2014 California Data saved in ca14
county_data14, district_data14, school_data14, state_data14 = subset_by_agg_level(ca14)
ca13 = read_in_file('CA_discip13.csv')  # 2013 California Data saved in ca13
county_data13, district_data13, school_data13, state_data13 = subset_by_agg_level(ca13)
ca12 = read_in_file('CA_discip12.csv')  # 2012 California Data saved in ca12
county_data12, district_data12, school_data12, state_data12 = subset_by_agg_level(ca12)

print("2012 data:")
get_unique_values(ca12, verbose=True)
print("2013 data:")
get_unique_values(ca13, verbose=True)
print("2014 data:")
get_unique_values(ca14, verbose=True)
print("-----------------------------------------------------------------------------------------")

# creating the useful dataframe for visualizing change in schools over time
indices_schools = [2, -2,-1]  # for this information we only need the name of school, total referrals, and year

schools14 = subset_cols_dataframe(school_data14, indices_schools)  # slicing out columns for each year
schools13 = subset_cols_dataframe(school_data13, indices_schools)
schools12 = subset_cols_dataframe(school_data12, indices_schools)

school_totals = append_dataframes([schools14, schools13, schools12])  # appending all years together

by_school = school_totals.groupby(['Name','Year']).sum()
print("Preview of groupby product: ")
print(by_school[0:10])

print(by_school.describe())

by_school.sort('Total', ascending=False).head(50)

total_ref = by_school.Total.tolist() # get just the number total referrals/year in a list
cleanedList = [x for x in total_ref if str(x) != 'nan']  # temp fix since my 'nan' is string not NaN

# make histogram from this list of values
yearly_ref = Counter()
for value in cleanedList:
    yearly_ref[value] += 1
    
yearly_ref_values = list(yearly_ref.keys())

print(yearly_ref.most_common(15))

P.hist(yearly_ref_values, bins=100, facecolor='green')
plt.title('Counts of Number of Referrals per Year')
plt.ylabel('Number of Schools')
plt.xlabel('Number Referrals')


#  choose which columns you want to subset
indices_ethnicity = [4, 6, 12]

# state data w/o agg_level and CID and name
indices_IDless = [4, 3, 5, 6, 7, 8, 9, 10, 12]
ethnic14state = subset_cols_dataframe(state_data14, indices_IDless)
ethnic13state = subset_cols_dataframe(state_data13, indices_IDless)
ethnic12state = subset_cols_dataframe(state_data12, indices_IDless)

# append the state data from all 3 years together
ethnic_discip_state = append_dataframes([ethnic14state, ethnic13state, ethnic12state])

# optional: uncomment to export this dataframe in csv form
# export_df_to_csv(ethnic_discip_state, 'ethnic_discip_state.csv')

group_year_ethnicity = ethnic_discip_state.groupby(['Ethnicity', 'Year']).sum()
group_ethnic = ethnic_discip_state.groupby(['Ethnicity']).sum()
group_year = ethnic_discip_state.groupby(['Year']).sum()

# optional: uncomment to export this dataframe in csv form
# export_df_to_csv(group_year_ethnicity, 'group_year_ethnicity.csv')

group_year_ethnicity.plot(kind = 'bar',stacked = True, title = 'Referrals by Ethnicity and Year')

#InAmGroup = group_year_ethnicity.groupby(group_year_ethnicity.Ethnicity==1)


group_year_ethnicity.describe().plot(kind = 'area', stacked =True, title = '')


group_year_ethnicity.describe()

ethnic_discip = read_in_file('ethnic_discip_state.csv')
print("Columns are: %s" %list(ethnic_discip)) # columns of dataframe
print("Number of entries is: %s" %len(ethnic_discip)) # num entries

ethnic_discip.plot(x='Ethnicity',y ='Drugs', kind = 'scatter',xticks=[0,1,2,3,4,5,6,7,8,9], title = 'Total Referrals by Ethnicity')

byInAm = ethnic_discip[ethnic_discip.Ethnicity==1]
print(byInAm)

year = byInAm.groupby(['Year']).sum()
print(year)