import numpy as np import matplotlib.pyplot as plt from datetime import datetime from __future__ import division import pandas as pd import sys print "my python version: ", sys.version_info[:] print "my pandas version: ", pd.__version__ print "my numpy version: ", np.version.version plt.rcParams['figure.figsize'] = (15, 10) # set default fig size click_file = './data/yoochoose-data/yoochoose-clicks.dat' buy_file = './data/yoochoose-data/yoochoose-buys.dat' click = pd.read_csv(click_file, header=None, index_col=False, parse_dates=[1], names=['SesID', 'TimeStamp','ItemID','Category'], dtype={'SesID':str, 'TimeStamp':datetime, 'ItemID':str, 'Category': str}) buy = pd.read_csv(buy_file, header=None,index_col=False, parse_dates=[1], names=['SesID', 'TimeStamp', 'ItemID', 'Price', 'Quantity'], dtype={'SesID':str, 'TimeStamp':datetime, 'ItemID':str, 'Price':float, 'Quantity':int}) click.tail() buy.tail() total = pd.concat([click[['TimeStamp','SesID','ItemID']],buy[['TimeStamp','SesID','ItemID']]], ignore_index=True) item2category = click[['ItemID','Category']].copy() item2category.drop_duplicates(inplace=True) print 'length of item2category :',len(item2category) print 'total num of items: ', len(total['ItemID'].drop_duplicates()) # len(item2category) >= len(total['ItemID'].drop_duplicates()) # this means some item belongs to multiple category, # about 2categories per item items_group = item2category.groupby('ItemID') category_group = item2category.groupby('Category') items_group_count = items_group.count() category_group_count = category_group.count() buy_sessions = buy['SesID'].drop_duplicates() total['IsBuy'] = total['SesID'].isin(buy_sessions) total_sesID_group = total.groupby('SesID') click_sesID_group = total[total['IsBuy'] == False].groupby('SesID') buy_sesID_group = total[total['IsBuy'] == True].groupby('SesID') n_sessions = len(total_sesID_group) n_click_sessions = len(click_sesID_group) n_buy_sessions = len(buy_sesID_group) total_records = len(total) n_clicks = (total['IsBuy'] == False).sum() n_buys = (total['IsBuy'] == True).sum() print "number of records: in click sessions:", n_clicks print "number of records in buy sessions:", n_buys print "total records: ", total_records print 'n_click_sessions: ', n_click_sessions print "total sessions: ", n_sessions print 'n_buy_sessions: ', n_buy_sessions print "avg clicks per_session: ", n_clicks / n_click_sessions print "avg clicks(including buys) per_session: ", n_buys / n_buy_sessions total_sesID_group_count = total_sesID_group['TimeStamp'].count() click_sesID_group_count = click_sesID_group['TimeStamp'].count() buy_sesID_group_count = buy_sesID_group['TimeStamp'].count() max_count = total_sesID_group_count.max(axis=0) total_sesID_group_count.plot(kind='hist', bins=np.arange(max_count)) click_bins, _ = np.histogram(click_sesID_group_count.values , bins=np.arange(1, 25)) buy_bins, _ = np.histogram(buy_sesID_group_count.values , bins=np.arange(1, 25)) # scale to [0, 1] click_bins = click_bins / click_bins.sum() buy_bins = buy_bins / buy_bins.sum() plt.bar(np.arange(1, 24)-0.4, click_bins, 0.4, color='b', label='click') plt.bar(np.arange(1, 24), buy_bins, 0.4, color='g', label='buy') plt.legend() plt.xticks(np.arange(1, 24),np.arange(1, 24)) sessions_with_one_record = total_sesID_group.first() buy_sessions_with_one_record = buy_sesID_group.first() sessions_with_month = sessions_with_one_record['TimeStamp'].apply(lambda x: x.strftime('%Y-%m')) sessions_with_month_count = sessions_with_month.value_counts().sort_index() buy_sessions_with_month = buy_sessions_with_one_record['TimeStamp'].apply(lambda x: x.strftime('%Y-%m')) buy_sessions_with_month_count = buy_sessions_with_month.value_counts().sort_index() buy_sessons_monthly_ratio = buy_sessions_with_month_count /sessions_with_month_count ax = sessions_with_month_count.plot(kind='bar', alpha=0.5, color='b', label='click sessions') buy_sessions_with_month_count.plot(kind='bar', ax=ax, alpha=0.5, color='y', label='buy sessions') plt.legend(loc=2) ax1 = ax.twinx() buy_sessons_monthly_ratio.plot(ax=ax1, color='r',marker='o',markersize=5, label='buy/click ratio') ax1.set_ylim([0, 0.07]) plt.legend() print 'buy_sessons_monthly_ration:\n', buy_sessons_monthly_ratio.values print 'mean: ', buy_sessons_monthly_ratio.values.mean(), ' std: ', buy_sessons_monthly_ratio.values.std() sessions_with_week = sessions_with_one_record['TimeStamp'].apply(lambda x: x.timetuple().tm_yday // 7) sessions_with_week_count = sessions_with_week.value_counts().sort_index() buy_sessions_with_week = buy_sessions_with_one_record['TimeStamp'].apply(lambda x: x.timetuple().tm_yday // 7) buy_sessions_with_week_count = buy_sessions_with_week.value_counts().sort_index() sessions_with_week_count.index buy_sessions_week_ratio = buy_sessions_with_week_count /sessions_with_week_count fig, ax1 = plt.subplots() ax1.bar(sessions_with_week_count.index -0.4, sessions_with_week_count.values, 0.8, alpha=0.5, color='b', label='click sessions') ax1.bar(buy_sessions_with_week_count.index -0.4, buy_sessions_with_week_count.values, 0.8, alpha=0.5, color='y', label='buy sessions') plt.legend(loc=2) ax1 = ax1.twinx() ax1.plot(sessions_with_week_count.index, buy_sessions_week_ratio.values, color='r',marker='o',markersize=5, label='buy/click ratio') ax1.set_ylim([0, 0.07]) mean_ration = buy_sessons_week_ratio.values.mean() plt.axhline(mean_ration,color='r', linestyle='dotted', label='avg ratio:{:.3f}'.format(mean_ration)) plt.legend(loc=1) print 'buy_sessons_week_ration:\n', buy_sessons_week_ratio.values print 'mean: ', mean_ration, ' std: ', buy_sessons_week_ratio.values.std() sessions_with_day = sessions_with_one_record['TimeStamp'].apply(lambda x: x.date()) sessions_with_day_count = sessions_with_day.value_counts().sort_index() buy_sessions_with_day = buy_sessions_with_one_record['TimeStamp'].apply(lambda x: x.date()) buy_sessions_with_day_count = buy_sessions_with_day.value_counts().sort_index() plt.title("[0,max] scaled sessions_per_day statistics") plt.plot(sessions_with_day_count.values / max(sessions_with_day_count.values), label="all_sessions_per_day") plt.plot(buy_sessions_with_day_count.values / max(buy_sessions_with_day_count.values), label="buy_sessions_per_day") plt.plot(buy_sessions_with_day_count.values / sessions_with_day_count.values, label="ratio") plt.vlines(np.arange(0, len(sessions_with_day_count), 7), [0], [1], linestyles='dotted', label='Tuesday') plt.legend(loc=2) plt.xlim([0,len(sessions_with_day_count)]) sessions_with_weekday = sessions_with_one_record['TimeStamp'].dt.weekday sessions_with_weekday_count = sessions_with_weekday.value_counts().sort_index() buy_sessions_with_weekday = buy_sessions_with_one_record['TimeStamp'].dt.weekday buy_sessions_with_weekday_count = buy_sessions_with_weekday.value_counts().sort_index() buy_sessions_weekday_ratio = buy_sessions_with_weekday_count / sessions_with_weekday_count fig, ax1 = plt.subplots() ax1.bar(np.arange(7)-0.4, sessions_with_weekday_count.values, 0.8, alpha=0.5, color='b', label='click sessions') ax1.bar(np.arange(7)-0.4, buy_sessions_with_weekday_count.values, 0.8, alpha=0.5, color='y', label='buy sessions') plt.xticks(np.arange(7), ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']) plt.legend(loc=2) ax1.set_ylim([0, 2200000]) ax2 = ax1.twinx() ax2.plot(np.arange(7), buy_sessions_weekday_ratio.values, color='r',marker='o',markersize=5, label='buy/click ratio') ax2.set_ylim([0, 0.07]) plt.legend(loc=1) print 'buy_sessons_weekday_ration:\n', buy_sessons_weekday_ratio.values sessions_with_hour = sessions_with_one_record['TimeStamp'].dt.hour sessions_with_hour_count = sessions_with_hour.value_counts().sort_index() buy_sessions_with_hour = buy_sessions_with_one_record['TimeStamp'].dt.hour buy_sessions_with_hour_count = buy_sessions_with_hour.value_counts().sort_index() buy_sessons_hour_ratio = buy_sessions_with_hour_count /sessions_with_hour_count print 'buy_sessons_hour_ratio:\n', buy_sessons_hour_ratio.values fig = plt.figure() ax1 = fig.add_subplot(111) ax1.plot(sessions_with_hour_count.values, color='b', label='total sessions') ax1.set_ylabel('total sessions') ax1.legend(loc=1) ax2 = ax1.twinx() ax2.plot(buy_sessions_with_hour_count, color='r', label='buy sessions') ax2.set_ylabel('buy sessions') ax2.legend(loc=2) plt.grid() ax2.set_xlim([0,23]) ax2.set_ylim([0,50000]) ax2.set_xticks(np.arange(0,24)) ax1.vlines(np.arange(0,24), [0], sessions_with_hour_count.values, linestyles='dotted') # it takes me about half an hour to run this cell, I really don't know how to speed up # if you know, please let me know: tianjun.cpp@gmail.com total.sort(['SesID', 'TimeStamp'], inplace=True) total['TimeDiff'] = total.groupby('SesID', sort=False)['TimeStamp'].transform(lambda x: x.diff()) print "max diff: ",total['TimeDiff'].max() # less than an hour total['TimeDiff'] = total['TimeDiff'].map(lambda x : x.microsecond / 10**6 + x.second + 60 * x.minute if x else None) total['TimeDiffRank'] = total.groupby('SesID', sort=False).cumcount() click_records = total[total['IsBuy']==False] buy_records = total[total['IsBuy']==True] click_time_diff_statistic = [click_records[click_records['TimeDiffRank']==i]['TimeDiff'] for i in range(1,100)] avg_click_timediff = [x.mean() for x in click_time_diff_statistic] plt.plot(np.arange(1,100), avg_click_timediff, color='b', label='avg time diff between two sequencial clicks in click sessions') buy_time_diff_statistic = [buy_records[buy_records['TimeDiffRank']==i]['TimeDiff'] for i in range(1,100)] avg_buy_timediff = [x.mean() for x in buy_time_diff_statistic] plt.plot(np.arange(1,100), avg_buy_timediff, color='r', label='avg time diff between two sequencial clicks in buy sessions') plt.legend() items_group_count['Category'].value_counts() click['Category'].value_counts() category_counts = category_group.count() category_counts.columns = ['Count'] category_counts.sort('Count',inplace=True, ascending=False) category_counts[category_counts['Count']>100].plot(kind='bar') items_group_count.columns=['Count'] items_group_count['Count'].value_counts().sort_index().plot(kind='bar') n_views_in_buyses = buy_sesID_group['ItemID'].apply(lambda x: len(x.unique())) n_views_in_buyses_stat = n_views_in_buyses.value_counts().sort_index() n_views_in_clickses = click_sesID_group['ItemID'].apply(lambda x: len(x.unique())) n_views_in_clickses_stat = n_views_in_clickses.value_counts().sort_index() n_views_in_buyses_stat = n_views_in_buyses_stat / n_views_in_buyses_stat.sum() # scale to [0,1] n_views_in_clickses_stat = n_views_in_clickses_stat / n_views_in_clickses_stat.sum() # scale to [0,1] g = pd.DataFrame({'click':n_views_in_buyses_stat[n_views_in_buyses_stat.index < 30 ], 'buy': n_views_in_clickses_stat[n_views_in_clickses_stat.index < 30 ]}) g.plot(kind='bar') click_item_group = click.groupby('ItemID') buy_item_group = buy.groupby('ItemID') item_click_count = click_item_group.count() item_buy_count = buy_item_group.count() item_buy_count_sorted = item_buy_count['TimeStamp'].copy() item_buy_count_sorted.sort(ascending=False) #plt.plot(np.log(item_buy_count_sorted)) click_groupby_category = click.groupby(['Category', click.TimeStamp.dt.dayofyear]) category_clickcount_daily = click_groupby_category.count() category_clickcount_daily.index.names = ['Category','Day'] category_clickcount_daily = category_clickcount_daily['SesID'] category_clickcount_daily = category_clickcount_daily.unstack() category_clickcount_daily.fillna(0, inplace=True) #category_clickcount_daily.reset_index(inplace=True) selected_category_clickcount = category_clickcount_daily[category_clickcount_daily.index.isin([str(i) for i in range(10)] + ['S'])] selected_category_clickcount.T.plot(kind='area') for i, label in enumerate(['Price','Quantity']): plt.subplot(2,1,i+1) haveprice_items = buy[~(buy[label] == 0)] haveprice_items_group = haveprice_items.groupby(haveprice_items.TimeStamp.dt.dayofyear) haveprice_items_day_count = haveprice_items_group.count()['SesID'] plt.bar(haveprice_items_day_count.index, haveprice_items_day_count.values, label='items have '+ label) noprice_items = buy[(buy[label] == 0)] noprice_items_group = noprice_items.groupby(noprice_items.TimeStamp.dt.dayofyear) noprice_items_day_count = noprice_items_group.count()['SesID'] plt.bar(noprice_items_day_count.index, noprice_items_day_count.values, color='r',label='items lost '+label) plt.legend() plt.xlim([91, 273]) plt.xlabel('day of year') plt.hist(buy['Price'].values, bins=np.arange(1, 10000, 100))