import json
import pandas as pd
from itertools import *
from matplotlib import cm
import numpy as np
from scipy import stats
import statsmodels.api as sm
import matplotlib
from matplotlib import pyplot as plt
from IPython.core.pylabtools import figsize
%matplotlib inline

poetry_features = pd.read_csv('data/poetry_features.csv')

#Drop rows with missing data and the frequency columns that are not used in this analysis.
poetry_features = poetry_features.dropna(axis=0)
poetry_features = poetry_features.drop(['punctFreq','alliterFreq','unigramFreq'],axis=1)

from datetime import datetime
dt = [datetime.utcfromtimestamp(t) for t in poetry_features.created]
pi = pd.PeriodIndex([pd.Period(d,'D') for d in dt])

#Monthly count of poems posted.
monthly_ = pd.DataFrame(np.ones(len(pi)),index=pi).resample('M',how='sum')
ax = monthly_.plot(kind='bar',color=["#7A68A6"],alpha=0.7)
ax.set_ylabel('Number of Poems')
ax.set_xlabel('Month')
ax.set_title('Number of Poems posted on the Powerpoetry website')

#Daily PeriodIndex for each poem.
poetry_features['periodindex'] = pd.PeriodIndex([pd.Period(d,'D') for d in dt])

#Group by user id and day. Take the mean to account for dumping behavior
#(several poems posted by the same user on the same day are averaged into one row).
grouped = poetry_features.groupby(['uid','periodindex']).aggregate(np.mean).sort_index()
ut = grouped.index

#Number the poems of each user in chronological order.
count_ = pd.DataFrame(grouped.index.tolist())[0].value_counts().sort_index()
count = [i+1 for user,x in count_.iteritems() for i in range(x)]

#Create (user id, period, poem number) tuples to be used later in the analysis.
nix = [(u[0],u[1],c) for c,u in zip(count,ut)]
nth = [x[2] for x in nix]
uids = [x_[0] for x_ in nix]
grouped.index = uids #Change the index to user IDs.

#Clean non-required columns and admin uids.
def clean(df):
    #Drop users 0, 1 and 4: these accounts belong to site admins.
    try:
        df = df.drop(['created','nid'],axis=1)
        df = df.drop([0,1,4])
    except:
        pass
    return df

grouped = clean(grouped)
grouped = (grouped - grouped.mean()) / grouped.std() #Standardize each feature.
grouped.describe()

print 'Number of poems on the website: {0}'.format(len(poetry_features))
print 'Number of poems on the website after dumping behavior is taken into account: {0}'.format(len(grouped))
#Number of poets who posted on the website.
print 'Number of poets on the website: {0}'.format(len(set(poetry_features.uid)))

#Count the number of return visits per month.
return_traffic = pd.DataFrame(map((lambda x: (x[1], int(x[2] > 1))),nix)).groupby(0).sum().resample('M',how='sum')
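#Side note, not part of the original notebook: the `resample('M', how='sum')` calls above
#use the pandas 0.x API. A minimal sketch of the same monthly counts on a newer stack
#(assumption: pandas >= 0.18, where `how=` was replaced by method chaining) would be:
#monthly_ = pd.DataFrame(np.ones(len(pi)), index=pi).resample('M').sum()
#return_traffic = (pd.DataFrame([(x[1], int(x[2] > 1)) for x in nix])
#                    .groupby(0).sum().resample('M').sum())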
#Count the number of first-time visits per month (kept commented out here; it is recomputed below).
#first_time_traffic = pd.DataFrame(map((lambda x: (x[1], int(x[2] == 1))),nix)).groupby(0).sum().resample('M')

#Chart the return visitors.
traffic = pd.concat([return_traffic],axis=1)
traffic.columns = ['return']
ax = traffic.plot(kind='bar',color=["#7A68A6"],alpha=0.7)
ax.set_ylabel('Number of Visitors')
ax.set_xlabel('Month')
ax.set_title('Breakdown of return visitors to Powerpoetry website')

#Chart the first-time visitors.
first_time_traffic = pd.DataFrame(map((lambda x: (x[1], int(x[2] == 1))),nix)).groupby(0).sum().resample('M',how='sum')
traffic = pd.concat([first_time_traffic],axis=1)
traffic.columns = ['first time']
ax = traffic.plot(kind='bar',color=["#7A68A6"],alpha=0.7)
ax.set_ylabel('Number of Visitors')
ax.set_xlabel('Month')
ax.set_title('Breakdown of first time visitors to Powerpoetry website')

#Distribution of activity: how many poets posted exactly n poems.
nthpoem = np.array([i[2] for i in nix])
bin_counts_cumulative = np.bincount(nthpoem)[1:][::-1] #Reverse order: number of poets with at least n poems.
bin_counts = [bin_count - bin_counts_cumulative[i-1] for i, bin_count in enumerate(bin_counts_cumulative) if i > 0] #Difference the cumulative counts, skipping the first entry.
bin_counts = [bin_counts_cumulative[0]] + bin_counts #Prepend the first entry.
bin_counts.reverse() #bin_counts[i] is now the number of poets with exactly i+1 poems.
ind = range(1,1 + len(bin_counts)) #The x locations for the groups.

for i, n in enumerate(bin_counts):
    if i < 20:
        print 'number of poets who posted {0} poem(s) : {1}'.format(i+1,n)

#Chart the distribution.
fig, ax = plt.subplots()
ax.bar(ind,bin_counts,color=["#7A68A6"],alpha=0.7)
ax.set_ylabel('Number of poets')
ax.set_xlabel('Number of poems')
ax.set_title('Distribution of Activity of Poets')
ax.annotate('Hello most active user!', xy=(ind[-1], 1), xytext=(300, 20000),
            arrowprops=dict(facecolor='black', shrink=0.05))

def seasoned(cut,nix,grouped):
    'Return the feature rows, poem numbers and user ids of poets with at least `cut` poems.'
    ix = filter((lambda x: x[2] >= cut),nix) #Seasoned poets: at least `cut` poems posted.
    unique_uids = np.unique([i[0] for i in ix])
    unique_nids = [ix_[2] for ix_ in nix if ix_[0] in unique_uids] #Poem numbers (nth poem) for the seasoned poets.
    features_ = grouped.loc[unique_uids] #Feature set for the seasoned poets.
    return(features_, unique_nids, unique_uids)

def distribution_betas(seasoned_poets,unique_uids):
    'Loop through the dataframe one user at a time, regress each feature on the poem number, and chart the distribution of slopes.'
    sig = 0
    betas_user = np.zeros((len(unique_uids),seasoned_poets.shape[1]))
    for x,uid in enumerate(unique_uids):
        scores = seasoned_poets.loc[uid]
        #Iterate over the features.
        for z,(feature,score) in enumerate(scores.iteritems()):
            #Regression coefficients: feature score against poem number.
            y = score.values
            X = np.arange(len(score))+1
            X = sm.add_constant(X)
            res = sm.OLS(y,X).fit() #OLS
            #if x < 1: #Show the full summary for the first user.
            #    print feature, uid
            #    print res.summary()
            betas_user[x,z] = res.params[1]
            if res.f_pvalue < 0.05: #Count the betas that pass a p-value threshold.
                sig += 1
    print '{0} significant results out of {1} regressions'.format(sig,betas_user.size)

    #Chart the distribution of betas, one subplot per feature.
    figsize(25, 10)
    fig = plt.figure()
    betas_, columns_ = betas_user.T, seasoned_poets.columns
    for k, (beta, column) in enumerate(zip(betas_,columns_)):
        sx = plt.subplot(int(betas_.shape[0]/2+1), 2, k+1)
        plt.rc('axes', color_cycle=['r', 'g', 'b', 'y'])
        plt.text(beta.min(), 40, 'average beta is {0}'.format(round(beta.mean(),2)), fontsize=15)
        plt.setp(sx.get_yticklabels(), visible=False)
        plt.hist(beta,color=cm.jet(1.*k/len(betas_)), alpha=0.4,
                 bins=np.linspace(beta.min(), beta.max(), 20)) #beta holds every user's slope for this metric.
        plt.ylim(0,50)
        plt.legend([column])
        plt.vlines(0, 0, 500, color="k", linestyles="--", lw=1)
        #Significance test: is the mean beta different from zero?
        p_val = stats.ttest_1samp(beta, 0)[1]
        if p_val < 0.10:
            plt.text(beta.min(), 30, 'statistically significant at {0}'.format(round(p_val,3)), fontsize=15)
    #plt.autoscale(tight=True)
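#A minimal cross-check of the per-user slopes computed above, using scipy's linregress
#instead of statsmodels OLS. This is a sketch, not part of the original analysis, and
#`check_user_slopes` is a hypothetical helper name; the slopes should match res.params[1].
def check_user_slopes(seasoned_poets, uid):
    'Return {feature: (slope, p-value)} for one user, fitted with stats.linregress.'
    slopes = {}
    scores = seasoned_poets.loc[uid]
    for feature, score in scores.iteritems():
        x = np.arange(len(score)) + 1  #Poem number, as in distribution_betas.
        slope, intercept, r, p, se = stats.linregress(x, score.values)
        slopes[feature] = (slope, p)
    return slopes
#Example usage (once seasoned_poets and unique_uids exist):
#check_user_slopes(seasoned_poets, unique_uids[0])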
cut = 10
seasoned_poets, unique_nids, unique_uids = seasoned(cut,nix,grouped) #Poems from poets with at least `cut` poems, indexed by uid.
distribution_betas(seasoned_poets,unique_uids)

cut = 12
seasoned_poets, unique_nids, unique_uids = seasoned(cut,nix,grouped)
distribution_betas(seasoned_poets,unique_uids)

def abbreviate(seasoned_poets,cut):
    'Keep only the first `cut` poems of each poet.'
    seasoned_poets_abbreviated = seasoned_poets.copy()
    seasoned_poets_abbreviated.index = unique_nids
    seasoned_poets_abbreviated['nid'] = unique_nids
    #Filter by the cutoff point: take only the nth poems up to `cut`.
    filter_ = seasoned_poets_abbreviated['nid'] <= cut
    seasoned_poets_abbreviated = seasoned_poets_abbreviated[filter_]
    seasoned_poets_abbreviated = seasoned_poets_abbreviated.drop('nid',axis=1)
    return seasoned_poets_abbreviated

def average_progress_chart(seasoned_poets_abbreviated,unique_nids):
    'Regress the median feature score at each poem number on the poem number and chart the fit.'
    figsize(25, 10)
    betas_median_feature = np.zeros(seasoned_poets_abbreviated.shape[1])
    fig = plt.figure()
    plt.subplots_adjust(hspace=0.,wspace=0.)
    for k,(column,feature) in enumerate(seasoned_poets_abbreviated.iteritems()):
        #Plot parameters.
        sx = plt.subplot(int(betas_median_feature.shape[0]/2+1), 2, k+1)
        plt.rc('axes', color_cycle=['r', 'g', 'b', 'y'])
        plt.setp(sx.get_yticklabels(), visible=False)
        #Median score for each poem number.
        median_feature = feature.groupby(feature.index).median()
        y = median_feature.values
        X = median_feature.index
        X = sm.add_constant(X)
        res = sm.OLS(y,X).fit()
        betas_median_feature[k] = res.params[1]
        #Plot the progress.
        plt.text(1, y.max(), 'beta of the slope is {:10.4f} with p-value of {:10.4f}'.format(res.params[1],res.f_pvalue), fontsize=12)
        #Scatter.
        sx = plt.scatter(X[:,1],y, alpha=0.9,marker=(6,0))
        #Fitted regression line.
        x_ = np.unique(X[:,1])
        y_ = (res.params[0] + res.params[1]*x_)
        col = cm.jet(1.*k/len(betas_median_feature))
        plt.plot(x_,y_,color=col)
        plt.legend([column])
        plt.xlim(0, seasoned_poets_abbreviated.index.max())
        plt.autoscale(tight=True)

#How many poets are left at each cutoff?
npoems = []
cutoff = []
for i in range(8,30):
    seasoned_poets, unique_nids, unique_uids = seasoned(i,nix,grouped)
    seasoned_poets_abbreviated = abbreviate(seasoned_poets,i)
    cutoff.append(i)
    npoems.append(seasoned_poets_abbreviated.shape[0]/i)

fig, ax = plt.subplots()
ax.plot(cutoff,npoems)
ax.set_ylabel('Number of poets with at least the minimum number of poems')
ax.set_xlabel('Minimum number of poems')

i = 12
seasoned_poets, unique_nids, unique_uids = seasoned(i,nix,grouped)
seasoned_poets_abbreviated = abbreviate(seasoned_poets,i)
average_progress_chart(seasoned_poets_abbreviated,unique_nids)

i = 20
seasoned_poets, unique_nids, unique_uids = seasoned(i,nix,grouped)
seasoned_poets_abbreviated = abbreviate(seasoned_poets,i)
average_progress_chart(seasoned_poets_abbreviated,unique_nids)
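#Before running the A/B comparisons below, a quick sanity check of the two-sample
#Kolmogorov-Smirnov test on synthetic data. This is a sketch, not part of the original
#analysis; the seed and sample sizes are arbitrary assumptions.
np.random.seed(0)
same = stats.ks_2samp(np.random.normal(0, 1, 200), np.random.normal(0, 1, 200))
shifted = stats.ks_2samp(np.random.normal(0, 1, 200), np.random.normal(0.5, 1, 200))
print 'KS on identical distributions: statistic {0:.3f}, p-value {1:.3f}'.format(same[0], same[1])
print 'KS on shifted distributions:   statistic {0:.3f}, p-value {1:.3f}'.format(shifted[0], shifted[1])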
def ab_test(cut,nix,grouped):
    'Compare the distribution of each feature in the first n poems against the next n poems.'
    seasoned_poets, unique_nids, unique_uids = seasoned(cut,nix,grouped)
    seasoned_poets.index = unique_nids
    print 'Total number of poets: {0}'.format(len(unique_uids))
    figsize(25, 10)
    fig = plt.figure()
    n = cut//2 #Split point: the first half of the poems.
    for k,(column,feature) in enumerate(seasoned_poets.iteritems()):
        a = feature[seasoned_poets.index <= n] #First n poems.
        b = feature[seasoned_poets.index > n]
        b = b[b.index <= n*2] #Keep the sample sizes equal: poems n+1 to 2n.
        sx = plt.subplot(int(seasoned_poets.shape[1]/2+1), 2, k+1)
        plt.setp(sx.get_yticklabels(), visible=False)
        lp = np.linspace(min(min(a),min(b)),max(max(a),max(b)),30)
        plt.hist(a,bins=lp,alpha=0.5)
        plt.hist(b,bins=lp,alpha=0.5)
        plt.autoscale(tight=True)
        plt.legend([column])
        #two_sample = stats.ttest_ind(a, b)
        #test_stat = stats.ranksums(a, b)
        test_stat = stats.ks_2samp(a, b)
        print 'For the feature {2}: the KS statistic is {0} and the p-value is {1}.'.format(test_stat[0],test_stat[1],column)

#Analyze 12 poems: first 6 vs the second 6.
ab_test(12,nix,grouped)
#Analyze 6 poems: first 3 vs the second 3.
ab_test(6,nix,grouped)

#Join the poem features with census-tract data.
features = pd.read_csv('data/poetry_features.csv')
features = features.drop(['punctFreq','alliterFreq','unigramFreq','Race'],axis=1)
features.index = features['nid']
features = features.drop(['nid','created','uid'],axis=1)

tracts = pd.read_csv('data/location_tracts.csv')
loc = tracts[['nid','AcsHouseholdIncomeMedian','TractCode']]
loc.index = loc['nid']
loc = loc.drop('nid',axis=1)

features_with_loc = loc.join(features)
features_with_loc = features_with_loc.dropna(axis=0)
features_with_loc['trigramFreq'] = (features_with_loc['trigramFreq'] - features_with_loc['trigramFreq'].mean()) / float(features_with_loc['trigramFreq'].std())

#Keep only tracts with more than 20 poems (139 tracts).
tract_size = features_with_loc.groupby('TractCode').size()
tract_list = tract_size[tract_size > 20]
#Subset of the feature set.
features_with_loc['take'] = features_with_loc['TractCode'].map(lambda x: x in tract_list.index)
features_with_loc_subset = features_with_loc[features_with_loc['take']==True]
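#An equivalent and arguably more idiomatic way to keep only the well-sampled tracts is
#pandas' groupby-filter. This is a sketch, not part of the original analysis; it should
#produce the same subset as the `take` flag above.
#features_with_loc_subset = features_with_loc.groupby('TractCode').filter(lambda g: len(g) > 20)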
#Group by TractCode and take the median for each tract (139 tracts in total).
features_by_tract = features_with_loc_subset.groupby('TractCode').aggregate('median')
features_by_tract = features_by_tract.drop(['take'],axis=1)
features_by_tract['AcsHouseholdIncomeMedian'] = features_by_tract['AcsHouseholdIncomeMedian'].map(lambda x: int(x))
trigram_by_tract = pd.concat([features_by_tract['trigramFreq'],np.log(features_by_tract['AcsHouseholdIncomeMedian'])],axis=1)
print 'Number of regions:', len(trigram_by_tract)

#Redefines the earlier helper for the income analysis: regress median trigram frequency on log median income.
def average_progress_chart(data):
    figsize(20, 15)
    fig = plt.figure()
    sx = plt.subplot(1,1,1)
    plt.setp(sx.get_yticklabels(), visible=False)
    y = data.values[:,0]
    X = data['AcsHouseholdIncomeMedian'].values
    X = sm.add_constant(X)
    res = sm.OLS(y,X).fit()
    betas_median = res.params[1]
    #Scatter.
    plt.scatter(X[:,1],y, alpha=0.9,marker=(6,0))
    #Fitted regression line.
    x_ = np.unique(X[:,1])
    y_ = (res.params[0] + res.params[1]*x_)
    plt.plot(x_,y_,color='red')
    plt.autoscale(tight=True)
    #Linear fit summary.
    plt.text(x_.min(), y.max(), 'beta of the slope is {:10.4f} with p-value of {:10.4f}'.format(res.params[1],res.f_pvalue), fontsize=20)
    #Labels.
    sx.set_ylabel('Trigram Frequencies')
    sx.set_xlabel('Income Levels')
    sx.set_title('Trigram Frequencies by Income Levels')
    print res.summary()

average_progress_chart(trigram_by_tract)

def ab_test_income(data):
    'Compare trigram frequencies of tracts below and above the median income.'
    figsize(25, 10)
    fig = plt.figure()
    median_income = data.AcsHouseholdIncomeMedian.median()
    a = data['trigramFreq'][data.AcsHouseholdIncomeMedian <= median_income]
    b = data['trigramFreq'][data.AcsHouseholdIncomeMedian > median_income]
    lp = np.linspace(min(min(a),min(b)),max(max(a),max(b)),10)
    ax = plt.subplot(211)
    plt.hist(a,bins=lp,alpha=0.5,color="#467821")
    plt.legend(['Lower Income'])
    ax = plt.subplot(212)
    plt.hist(b,bins=lp,alpha=0.5,color="#7A68A6")
    plt.autoscale(tight=True)
    plt.legend(['Higher Income'])
    #two_sample = stats.ttest_ind(a, b)
    #test_stat = stats.ranksums(a, b)
    test_stat = stats.ks_2samp(a, b)
    print 'The mean trigram frequency for lower-income tracts is {0}'.format(a.mean())
    print 'The mean trigram frequency for higher-income tracts is {0}'.format(b.mean())
    print 'The KS statistic is {0} and the p-value is {1}.'.format(test_stat[0],test_stat[1])

ab_test_income(trigram_by_tract)
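#As a final robustness sketch (not part of the original analysis): a simple bootstrap of the
#difference in mean trigram frequency between the lower- and higher-income tract groups.
#Assumes `trigram_by_tract` as built above; `bootstrap_mean_diff` is a new helper name and
#the number of resamples (10000) is an arbitrary choice.
def bootstrap_mean_diff(data, n_boot=10000):
    median_income = data.AcsHouseholdIncomeMedian.median()
    a = data['trigramFreq'][data.AcsHouseholdIncomeMedian <= median_income].values
    b = data['trigramFreq'][data.AcsHouseholdIncomeMedian > median_income].values
    diffs = np.zeros(n_boot)
    for j in range(n_boot):
        #Resample each group with replacement and record the difference in means.
        diffs[j] = (np.random.choice(a, len(a)).mean() -
                    np.random.choice(b, len(b)).mean())
    return np.percentile(diffs, [2.5, 97.5])

low, high = bootstrap_mean_diff(trigram_by_tract)
print '95% bootstrap interval for the mean difference (lower minus higher income): [{0:.3f}, {1:.3f}]'.format(low, high)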