import json
import pandas as pd
from itertools import *
from matplotlib import cm
import numpy as np
from scipy import stats
import statsmodels.api as sm
import matplotlib
from matplotlib import pyplot as plt
from IPython.core.pylabtools import figsize
%matplotlib inline

poetry_features = pd.read_csv('data/poetry_features.csv')

#Drop rows with missing data and the frequency columns that are not used in this analysis.
poetry_features = poetry_features.dropna(axis=0)
poetry_features = poetry_features.drop(['punctFreq','alliterFreq','unigramFreq'],axis=1)

from datetime import datetime
dt = [datetime.utcfromtimestamp(t) for t in poetry_features.created]
pi = pd.PeriodIndex([pd.Period(d,'D') for d in dt])

#Monthly count of poems posted.
monthly_ = pd.DataFrame(np.ones(len(pi)),index=pi).resample('M',how='sum')
ax = monthly_.plot(kind='bar',color=["#7A68A6"],alpha=0.7)
ax.set_ylabel('Number of Poems')
ax.set_xlabel('Month')
ax.set_title('Number of Poems posted on the Powerpoetry website')

#Daily PeriodIndex for each poem.
poetry_features['periodindex'] = pd.PeriodIndex([pd.Period(d,'D') for d in dt])

#Group by user id and day. Take the mean to account for dumping behavior
#(several poems posted by the same user on the same day are averaged into one row).
grouped = poetry_features.groupby(['uid','periodindex']).aggregate(np.mean).sort_index()
ut = grouped.index

#Number the poems of each user in chronological order.
count_ = pd.DataFrame(grouped.index.tolist())[0].value_counts().sort_index()
count = [i+1 for user,x in count_.iteritems() for i in range(x)]

#Create (user id, period, poem number) tuples to be used later in the analysis.
nix = [(u[0],u[1],c) for c,u in zip(count,ut)]
nth = [x[2] for x in nix]
uids = [x_[0] for x_ in nix]
grouped.index = uids #Change the index to user IDs.

#Clean non-required columns and admin uids.
def clean(df):
    #Drop users 0, 1 and 4: these accounts belong to site admins.
    try:
        df = df.drop(['created','nid'],axis=1)
        df = df.drop([0,1,4])
    except:
        pass
    return df

grouped = clean(grouped)
grouped = (grouped - grouped.mean()) / grouped.std() #Standardize each feature.
grouped.describe()

print 'Number of poems on the website: {0}'.format(len(poetry_features))
print 'Number of poems on the website after dumping behavior is taken into account: {0}'.format(len(grouped))
#Number of poets who posted on the website.
print 'Number of poets on the website: {0}'.format(len(set(poetry_features.uid)))

#Count the number of return visits per month.
return_traffic = pd.DataFrame(map((lambda x: (x[1], int(x[2] > 1))),nix)).groupby(0).sum().resample('M',how='sum')
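#Side note, not part of the original notebook: the `resample('M', how='sum')` calls above
#use the pandas 0.x API. A minimal sketch of the same monthly counts on a newer stack
#(assumption: pandas >= 0.18, where `how=` was replaced by method chaining) would be:
#monthly_ = pd.DataFrame(np.ones(len(pi)), index=pi).resample('M').sum()
#return_traffic = (pd.DataFrame([(x[1], int(x[2] > 1)) for x in nix])
#                    .groupby(0).sum().resample('M').sum())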
#Count the number of first-time visits per month (kept commented out here; it is recomputed below).
#first_time_traffic = pd.DataFrame(map((lambda x: (x[1], int(x[2] == 1))),nix)).groupby(0).sum().resample('M')

#Chart the return visitors.
traffic = pd.concat([return_traffic],axis=1)
traffic.columns = ['return']
ax = traffic.plot(kind='bar',color=["#7A68A6"],alpha=0.7)
ax.set_ylabel('Number of Visitors')
ax.set_xlabel('Month')
ax.set_title('Breakdown of return visitors to Powerpoetry website')

#Chart the first-time visitors.
first_time_traffic = pd.DataFrame(map((lambda x: (x[1], int(x[2] == 1))),nix)).groupby(0).sum().resample('M',how='sum')
traffic = pd.concat([first_time_traffic],axis=1)
traffic.columns = ['first time']
ax = traffic.plot(kind='bar',color=["#7A68A6"],alpha=0.7)
ax.set_ylabel('Number of Visitors')
ax.set_xlabel('Month')
ax.set_title('Breakdown of first time visitors to Powerpoetry website')

#Distribution of activity: how many poets posted exactly n poems.
nthpoem = np.array([i[2] for i in nix])
bin_counts_cumulative = np.bincount(nthpoem)[1:][::-1] #Reverse order: number of poets with at least n poems.
bin_counts = [bin_count - bin_counts_cumulative[i-1] for i, bin_count in enumerate(bin_counts_cumulative) if i > 0] #Difference the cumulative counts, skipping the first entry.
bin_counts = [bin_counts_cumulative[0]] + bin_counts #Prepend the first entry.
bin_counts.reverse() #bin_counts[i] is now the number of poets with exactly i+1 poems.
ind = range(1,1 + len(bin_counts)) #The x locations for the groups.

for i, n in enumerate(bin_counts):
    if i < 20:
        print 'number of poets who posted {0} poem(s) : {1}'.format(i+1,n)

#Chart the distribution.
fig, ax = plt.subplots()
ax.bar(ind,bin_counts,color=["#7A68A6"],alpha=0.7)
ax.set_ylabel('Number of poets')
ax.set_xlabel('Number of poems')
ax.set_title('Distribution of Activity of Poets')
ax.annotate('Hello most active user!', xy=(ind[-1], 1), xytext=(300, 20000),
            arrowprops=dict(facecolor='black', shrink=0.05))

def seasoned(cut,nix,grouped):
    'Return the feature rows, poem numbers and user ids of poets with at least `cut` poems.'
    ix = filter((lambda x: x[2] >= cut),nix) #Seasoned poets: at least `cut` poems posted.
    unique_uids = np.unique([i[0] for i in ix])
    unique_nids = [ix_[2] for ix_ in nix if ix_[0] in unique_uids] #Poem numbers (nth poem) for the seasoned poets.
    features_ = grouped.loc[unique_uids] #Feature set for the seasoned poets.
    return(features_, unique_nids, unique_uids)

def distribution_betas(seasoned_poets,unique_uids):
    'Loop through the dataframe one user at a time, regress each feature on the poem number, and chart the distribution of slopes.'
    sig = 0
    betas_user = np.zeros((len(unique_uids),seasoned_poets.shape[1]))
    for x,uid in enumerate(unique_uids):
        scores = seasoned_poets.loc[uid]
        #Iterate over the features.
        for z,(feature,score) in enumerate(scores.iteritems()):
            #Regression coefficients: feature score against poem number.
            y = score.values
            X = np.arange(len(score))+1
            X = sm.add_constant(X)
            res = sm.OLS(y,X).fit() #OLS
            #if x < 1: #Show the full summary for the first user.
            #    print feature, uid
            #    print res.summary()
            betas_user[x,z] = res.params[1]
            if res.f_pvalue < 0.05: #Count the betas that pass a p-value threshold.
                sig += 1
    print '{0} significant results out of {1} regressions'.format(sig,betas_user.size)

    #Chart the distribution of betas, one subplot per feature.
    figsize(25, 10)
    fig = plt.figure()
    betas_, columns_ = betas_user.T, seasoned_poets.columns
    for k, (beta, column) in enumerate(zip(betas_,columns_)):
        sx = plt.subplot(int(betas_.shape[0]/2+1), 2, k+1)
        plt.rc('axes', color_cycle=['r', 'g', 'b', 'y'])
        plt.text(beta.min(), 40, 'average beta is {0}'.format(round(beta.mean(),2)), fontsize=15)
        plt.setp(sx.get_yticklabels(), visible=False)
        plt.hist(beta,color=cm.jet(1.*k/len(betas_)), alpha=0.4,
                 bins=np.linspace(beta.min(), beta.max(), 20)) #beta holds every user's slope for this metric.
        plt.ylim(0,50)
        plt.legend([column])
        plt.vlines(0, 0, 500, color="k", linestyles="--", lw=1)
        #Significance test: is the mean beta different from zero?
        p_val = stats.ttest_1samp(beta, 0)[1]
        if p_val < 0.10:
            plt.text(beta.min(), 30, 'statistically significant at {0}'.format(round(p_val,3)), fontsize=15)
    #plt.autoscale(tight=True)
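#A minimal cross-check of the per-user slopes computed above, using scipy's linregress
#instead of statsmodels OLS. This is a sketch, not part of the original analysis, and
#`check_user_slopes` is a hypothetical helper name; the slopes should match res.params[1].
def check_user_slopes(seasoned_poets, uid):
    'Return {feature: (slope, p-value)} for one user, fitted with stats.linregress.'
    slopes = {}
    scores = seasoned_poets.loc[uid]
    for feature, score in scores.iteritems():
        x = np.arange(len(score)) + 1  #Poem number, as in distribution_betas.
        slope, intercept, r, p, se = stats.linregress(x, score.values)
        slopes[feature] = (slope, p)
    return slopes
#Example usage (once seasoned_poets and unique_uids exist):
#check_user_slopes(seasoned_poets, unique_uids[0])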
cut = 10
seasoned_poets, unique_nids, unique_uids = seasoned(cut,nix,grouped) #Poems from poets with at least `cut` poems, indexed by uid.
distribution_betas(seasoned_poets,unique_uids)

cut = 12
seasoned_poets, unique_nids, unique_uids = seasoned(cut,nix,grouped)
distribution_betas(seasoned_poets,unique_uids)

def abbreviate(seasoned_poets,cut):
    'Keep only the first `cut` poems of each poet.'
    seasoned_poets_abbreviated = seasoned_poets.copy()
    seasoned_poets_abbreviated.index = unique_nids
    seasoned_poets_abbreviated['nid'] = unique_nids
    #Filter by the cutoff point: take only the nth poems up to `cut`.
    filter_ = seasoned_poets_abbreviated['nid'] <= cut
    seasoned_poets_abbreviated = seasoned_poets_abbreviated[filter_]
    seasoned_poets_abbreviated = seasoned_poets_abbreviated.drop('nid',axis=1)
    return seasoned_poets_abbreviated

def average_progress_chart(seasoned_poets_abbreviated,unique_nids):
    'Regress the median feature score at each poem number on the poem number and chart the fit.'
    figsize(25, 10)
    betas_median_feature = np.zeros(seasoned_poets_abbreviated.shape[1])
    fig = plt.figure()
    plt.subplots_adjust(hspace=0.,wspace=0.)
    for k,(column,feature) in enumerate(seasoned_poets_abbreviated.iteritems()):
        #Plot parameters.
        sx = plt.subplot(int(betas_median_feature.shape[0]/2+1), 2, k+1)
        plt.rc('axes', color_cycle=['r', 'g', 'b', 'y'])
        plt.setp(sx.get_yticklabels(), visible=False)
        #Median score for each poem number.
        median_feature = feature.groupby(feature.index).median()
        y = median_feature.values
        X = median_feature.index
        X = sm.add_constant(X)
        res = sm.OLS(y,X).fit()
        betas_median_feature[k] = res.params[1]
        #Plot the progress.
        plt.text(1, y.max(), 'beta of the slope is {:10.4f} with p-value of {:10.4f}'.format(res.params[1],res.f_pvalue), fontsize=12)
        #Scatter.
        sx = plt.scatter(X[:,1],y, alpha=0.9,marker=(6,0))
        #Fitted regression line.
        x_ = np.unique(X[:,1])
        y_ = (res.params[0] + res.params[1]*x_)
        col = cm.jet(1.*k/len(betas_median_feature))
        plt.plot(x_,y_,color=col)
        plt.legend([column])
        plt.xlim(0, seasoned_poets_abbreviated.index.max())
        plt.autoscale(tight=True)

#How many poets are left at each cutoff?
npoems = []
cutoff = []
for i in range(8,30):
    seasoned_poets, unique_nids, unique_uids = seasoned(i,nix,grouped)
    seasoned_poets_abbreviated = abbreviate(seasoned_poets,i)
    cutoff.append(i)
    npoems.append(seasoned_poets_abbreviated.shape[0]/i)

fig, ax = plt.subplots()
ax.plot(cutoff,npoems)
ax.set_ylabel('Number of poets with at least the minimum number of poems')
ax.set_xlabel('Minimum number of poems')

i = 12
seasoned_poets, unique_nids, unique_uids = seasoned(i,nix,grouped)
seasoned_poets_abbreviated = abbreviate(seasoned_poets,i)
average_progress_chart(seasoned_poets_abbreviated,unique_nids)

i = 20
seasoned_poets, unique_nids, unique_uids = seasoned(i,nix,grouped)
seasoned_poets_abbreviated = abbreviate(seasoned_poets,i)
average_progress_chart(seasoned_poets_abbreviated,unique_nids)
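#Before running the A/B comparisons below, a quick sanity check of the two-sample
#Kolmogorov-Smirnov test on synthetic data. This is a sketch, not part of the original
#analysis; the seed and sample sizes are arbitrary assumptions.
np.random.seed(0)
same = stats.ks_2samp(np.random.normal(0, 1, 200), np.random.normal(0, 1, 200))
shifted = stats.ks_2samp(np.random.normal(0, 1, 200), np.random.normal(0.5, 1, 200))
print 'KS on identical distributions: statistic {0:.3f}, p-value {1:.3f}'.format(same[0], same[1])
print 'KS on shifted distributions:   statistic {0:.3f}, p-value {1:.3f}'.format(shifted[0], shifted[1])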
def ab_test(cut,nix,grouped):
    'Compare the distribution of each feature in the first n poems against the next n poems.'
    seasoned_poets, unique_nids, unique_uids = seasoned(cut,nix,grouped)
    seasoned_poets.index = unique_nids
    print 'Total number of poets: {0}'.format(len(unique_uids))
    figsize(25, 10)
    fig = plt.figure()
    n = cut//2 #Split point: the first half of the poems.
    for k,(column,feature) in enumerate(seasoned_poets.iteritems()):
        a = feature[seasoned_poets.index <= n] #First n poems.
        b = feature[seasoned_poets.index > n]
        b = b[b.index <= n*2] #Keep the sample sizes equal: poems n+1 to 2n.
        sx = plt.subplot(int(seasoned_poets.shape[1]/2+1), 2, k+1)
        plt.setp(sx.get_yticklabels(), visible=False)
        lp = np.linspace(min(min(a),min(b)),max(max(a),max(b)),30)
        plt.hist(a,bins=lp,alpha=0.5)
        plt.hist(b,bins=lp,alpha=0.5)
        plt.autoscale(tight=True)
        plt.legend([column])
        #two_sample = stats.ttest_ind(a, b)
        #test_stat = stats.ranksums(a, b)
        test_stat = stats.ks_2samp(a, b)
        print 'For the feature {2}: the KS statistic is {0} and the p-value is {1}.'.format(test_stat[0],test_stat[1],column)

#Analyze 12 poems: first 6 vs the second 6.
ab_test(12,nix,grouped)
#Analyze 6 poems: first 3 vs the second 3.
ab_test(6,nix,grouped)

#Join the poem features with census-tract data.
features = pd.read_csv('data/poetry_features.csv')
features = features.drop(['punctFreq','alliterFreq','unigramFreq','Race'],axis=1)
features.index = features['nid']
features = features.drop(['nid','created','uid'],axis=1)

tracts = pd.read_csv('data/location_tracts.csv')
loc = tracts[['nid','AcsHouseholdIncomeMedian','TractCode']]
loc.index = loc['nid']
loc = loc.drop('nid',axis=1)

features_with_loc = loc.join(features)
features_with_loc = features_with_loc.dropna(axis=0)
features_with_loc['trigramFreq'] = (features_with_loc['trigramFreq'] - features_with_loc['trigramFreq'].mean()) / float(features_with_loc['trigramFreq'].std())

#Keep only tracts with more than 20 poems (139 tracts).
tract_size = features_with_loc.groupby('TractCode').size()
tract_list = tract_size[tract_size > 20]
#Subset of the feature set.
features_with_loc['take'] = features_with_loc['TractCode'].map(lambda x: x in tract_list.index)
features_with_loc_subset = features_with_loc[features_with_loc['take']==True]
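#An equivalent and arguably more idiomatic way to keep only the well-sampled tracts is
#pandas' groupby-filter. This is a sketch, not part of the original analysis; it should
#produce the same subset as the `take` flag above.
#features_with_loc_subset = features_with_loc.groupby('TractCode').filter(lambda g: len(g) > 20)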
#Group by TractCode and take the median for each tract (139 tracts in total).
features_by_tract = features_with_loc_subset.groupby('TractCode').aggregate('median')
features_by_tract = features_by_tract.drop(['take'],axis=1)
features_by_tract['AcsHouseholdIncomeMedian'] = features_by_tract['AcsHouseholdIncomeMedian'].map(lambda x: int(x))
trigram_by_tract = pd.concat([features_by_tract['trigramFreq'],np.log(features_by_tract['AcsHouseholdIncomeMedian'])],axis=1)
print 'Number of regions:', len(trigram_by_tract)

#Redefines the earlier helper for the income analysis: regress median trigram frequency on log median income.
def average_progress_chart(data):
    figsize(20, 15)
    fig = plt.figure()
    sx = plt.subplot(1,1,1)
    plt.setp(sx.get_yticklabels(), visible=False)
    y = data.values[:,0]
    X = data['AcsHouseholdIncomeMedian'].values
    X = sm.add_constant(X)
    res = sm.OLS(y,X).fit()
    betas_median = res.params[1]
    #Scatter.
    plt.scatter(X[:,1],y, alpha=0.9,marker=(6,0))
    #Fitted regression line.
    x_ = np.unique(X[:,1])
    y_ = (res.params[0] + res.params[1]*x_)
    plt.plot(x_,y_,color='red')
    plt.autoscale(tight=True)
    #Linear fit summary.
    plt.text(x_.min(), y.max(), 'beta of the slope is {:10.4f} with p-value of {:10.4f}'.format(res.params[1],res.f_pvalue), fontsize=20)
    #Labels.
    sx.set_ylabel('Trigram Frequencies')
    sx.set_xlabel('Income Levels')
    sx.set_title('Trigram Frequencies by Income Levels')
    print res.summary()

average_progress_chart(trigram_by_tract)

def ab_test_income(data):
    'Compare trigram frequencies of tracts below and above the median income.'
    figsize(25, 10)
    fig = plt.figure()
    median_income = data.AcsHouseholdIncomeMedian.median()
    a = data['trigramFreq'][data.AcsHouseholdIncomeMedian <= median_income]
    b = data['trigramFreq'][data.AcsHouseholdIncomeMedian > median_income]
    lp = np.linspace(min(min(a),min(b)),max(max(a),max(b)),10)
    ax = plt.subplot(211)
    plt.hist(a,bins=lp,alpha=0.5,color="#467821")
    plt.legend(['Lower Income'])
    ax = plt.subplot(212)
    plt.hist(b,bins=lp,alpha=0.5,color="#7A68A6")
    plt.autoscale(tight=True)
    plt.legend(['Higher Income'])
    #two_sample = stats.ttest_ind(a, b)
    #test_stat = stats.ranksums(a, b)
    test_stat = stats.ks_2samp(a, b)
    print 'The mean trigram frequency for lower-income tracts is {0}'.format(a.mean())
    print 'The mean trigram frequency for higher-income tracts is {0}'.format(b.mean())
    print 'The KS statistic is {0} and the p-value is {1}.'.format(test_stat[0],test_stat[1])

ab_test_income(trigram_by_tract)
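#As a final robustness sketch (not part of the original analysis): a simple bootstrap of the
#difference in mean trigram frequency between the lower- and higher-income tract groups.
#Assumes `trigram_by_tract` as built above; `bootstrap_mean_diff` is a new helper name and
#the number of resamples (10000) is an arbitrary choice.
def bootstrap_mean_diff(data, n_boot=10000):
    median_income = data.AcsHouseholdIncomeMedian.median()
    a = data['trigramFreq'][data.AcsHouseholdIncomeMedian <= median_income].values
    b = data['trigramFreq'][data.AcsHouseholdIncomeMedian > median_income].values
    diffs = np.zeros(n_boot)
    for j in range(n_boot):
        #Resample each group with replacement and record the difference in means.
        diffs[j] = (np.random.choice(a, len(a)).mean() -
                    np.random.choice(b, len(b)).mean())
    return np.percentile(diffs, [2.5, 97.5])

low, high = bootstrap_mean_diff(trigram_by_tract)
print '95% bootstrap interval for the mean difference (lower minus higher income): [{0:.3f}, {1:.3f}]'.format(low, high)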