import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
from __future__ import division
import pandas as pd
import sys
print "my python version: ", sys.version_info[:]
print "my pandas version: ", pd.__version__
print "my numpy version: ", np.version.version
plt.rcParams['figure.figsize'] = (15, 10) # set default fig size
# Raw YooChoose (RecSys 2015 challenge) data files.
click_file = './data/yoochoose-data/yoochoose-clicks.dat'
buy_file = './data/yoochoose-data/yoochoose-buys.dat'

# FIX: the original passed dtype={'TimeStamp': datetime}, which is not a valid
# pandas dtype specification and conflicts with parse_dates=[1], which already
# converts column 1 to datetime64. The TimeStamp dtype entries are dropped.
click = pd.read_csv(click_file,
                    header=None, index_col=False, parse_dates=[1],
                    names=['SesID', 'TimeStamp', 'ItemID', 'Category'],
                    dtype={'SesID': str,
                           'ItemID': str,
                           'Category': str})
buy = pd.read_csv(buy_file,
                  header=None, index_col=False, parse_dates=[1],
                  names=['SesID', 'TimeStamp', 'ItemID', 'Price', 'Quantity'],
                  dtype={'SesID': str,
                         'ItemID': str,
                         'Price': float,
                         'Quantity': int})
my python version: (2, 7, 9, 'final', 0) my pandas version: 0.15.2 my numpy version: 1.9.1
Let's see some examples of click records and buy records:
# Peek at the last few click records.
click.tail()
SesID | TimeStamp | ItemID | Category | |
---|---|---|---|---|
33003939 | 11299809 | 2014-09-25 09:33:22.412000 | 214819412 | S |
33003940 | 11299809 | 2014-09-25 09:43:52.821000 | 214830939 | S |
33003941 | 11299811 | 2014-09-24 19:02:09.741000 | 214854855 | S |
33003942 | 11299811 | 2014-09-24 19:02:11.894000 | 214854838 | S |
33003943 | 11299811 | 2014-09-24 19:02:25.146000 | 214848658 | S |
# Peek at the last few buy records.
buy.tail()
SesID | TimeStamp | ItemID | Price | Quantity | |
---|---|---|---|---|---|
1150748 | 11368701 | 2014-09-26 07:52:51.357000 | 214849809 | 554 | 2 |
1150749 | 11368691 | 2014-09-25 09:37:44.206000 | 214700002 | 6806 | 5 |
1150750 | 11523941 | 2014-09-25 06:14:47.965000 | 214578011 | 14556 | 1 |
1150751 | 11423202 | 2014-09-26 18:49:34.024000 | 214849164 | 1046 | 1 |
1150752 | 11423202 | 2014-09-26 18:49:34.026000 | 214560500 | 5549 | 1 |
First we join buy and click data together.
total = pd.concat([click[['TimeStamp','SesID','ItemID']],buy[['TimeStamp','SesID','ItemID']]],
ignore_index=True)
item2category = click[['ItemID','Category']].copy()
item2category.drop_duplicates(inplace=True)
print 'length of item2category :',len(item2category)
print 'total num of items: ', len(total['ItemID'].drop_duplicates())
# len(item2category) >= len(total['ItemID'].drop_duplicates())
# this means some item belongs to multiple category,
# about 2categories per item
length of item2category : 100110 total num of items: 52739
items_group = item2category.groupby('ItemID')
category_group = item2category.groupby('Category')
items_group_count = items_group.count()
category_group_count = category_group.count()
buy_sessions = buy['SesID'].drop_duplicates()
total['IsBuy'] = total['SesID'].isin(buy_sessions)
total_sesID_group = total.groupby('SesID')
click_sesID_group = total[total['IsBuy'] == False].groupby('SesID')
buy_sesID_group = total[total['IsBuy'] == True].groupby('SesID')
n_sessions = len(total_sesID_group)
n_click_sessions = len(click_sesID_group)
n_buy_sessions = len(buy_sesID_group)
total_records = len(total)
n_clicks = (total['IsBuy'] == False).sum()
n_buys = (total['IsBuy'] == True).sum()
print "number of records: in click sessions:", n_clicks
print "number of records in buy sessions:", n_buys
print "total records: ", total_records
print 'n_click_sessions: ', n_click_sessions
print "total sessions: ", n_sessions
print 'n_buy_sessions: ', n_buy_sessions
print "avg clicks per_session: ", n_clicks / n_click_sessions
print "avg clicks(including buys) per_session: ", n_buys / n_buy_sessions
number of records: in click sessions: 29698257 number of records in buy sessions: 4456440 total records: 34154697 n_click_sessions: 9249729 total sessions: 9249729 n_buy_sessions: 509696 avg clicks per_session: 3.21071644369 avg clicks(including buys) per_session: 8.74332935711
From the above data, we know that
all sessions in the buy data also appear in the click data: len(set(click_sessions) | set(buy_sessions)) == len(set(click_sessions))
the average number of clicks in buy sessions (sessions that eventually lead to a purchase) is much higher than in click-only sessions.
# Per-session record counts (TimeStamp is never null, so count() gives session length).
total_sesID_group_count = total_sesID_group['TimeStamp'].count()
click_sesID_group_count = click_sesID_group['TimeStamp'].count()
buy_sesID_group_count = buy_sesID_group['TimeStamp'].count()

# Histogram of session lengths over the full range of observed lengths.
longest_session = total_sesID_group_count.max()
total_sesID_group_count.plot(kind='hist', bins=np.arange(longest_session))
<matplotlib.axes._subplots.AxesSubplot at 0x7fe2b112a250>
The fig above is not so intuitive. We can only know that most sessions have less than 10 actions (click/buy). So we cut out bins greater than 25. And this time we only show the hist of clicks and buy sessions' length.
# Session-length distributions restricted to lengths 1..24.
edges = np.arange(1, 25)
click_bins, _ = np.histogram(click_sesID_group_count.values, bins=edges)
buy_bins, _ = np.histogram(buy_sesID_group_count.values, bins=edges)

# Normalise so each bar shows the fraction of sessions with that length.
click_bins = click_bins / click_bins.sum()
buy_bins = buy_bins / buy_bins.sum()

centers = np.arange(1, 24)
plt.bar(centers - 0.4, click_bins, 0.4, color='b', label='click')
plt.bar(centers, buy_bins, 0.4, color='g', label='buy')
plt.legend()
plt.xticks(centers, centers)
([<matplotlib.axis.XTick at 0x7fe2bb0d6c90>, <matplotlib.axis.XTick at 0x7fe2bb0d6dd0>, <matplotlib.axis.XTick at 0x7fe2baec8850>, <matplotlib.axis.XTick at 0x7fe2baec8e50>, <matplotlib.axis.XTick at 0x7fe2baed05d0>, <matplotlib.axis.XTick at 0x7fe2baed0d10>, <matplotlib.axis.XTick at 0x7fe2baede490>, <matplotlib.axis.XTick at 0x7fe2baedebd0>, <matplotlib.axis.XTick at 0x7fe2bae66350>, <matplotlib.axis.XTick at 0x7fe2bae66a90>, <matplotlib.axis.XTick at 0x7fe2bae70210>, <matplotlib.axis.XTick at 0x7fe2bae70950>, <matplotlib.axis.XTick at 0x7fe2bae7b0d0>, <matplotlib.axis.XTick at 0x7fe2bae7b810>, <matplotlib.axis.XTick at 0x7fe2bae7bf50>, <matplotlib.axis.XTick at 0x7fe2bae856d0>, <matplotlib.axis.XTick at 0x7fe2bae85e10>, <matplotlib.axis.XTick at 0x7fe2bae8f590>, <matplotlib.axis.XTick at 0x7fe2bae8fcd0>, <matplotlib.axis.XTick at 0x7fe2bae98450>, <matplotlib.axis.XTick at 0x7fe2bae98b90>, <matplotlib.axis.XTick at 0x7fe2bae23310>, <matplotlib.axis.XTick at 0x7fe2bae23a50>], <a list of 23 Text xticklabel objects>)
Interesting: it's clear that people who eventually buy something tend to click more times. Maybe they are comparing related items?
Next, we draw click(buy) session each day/month, and also buy/click ratio.
sessions_with_one_record = total_sesID_group.first()
buy_sessions_with_one_record = buy_sesID_group.first()
sessions_with_month = sessions_with_one_record['TimeStamp'].apply(lambda x: x.strftime('%Y-%m'))
sessions_with_month_count = sessions_with_month.value_counts().sort_index()
buy_sessions_with_month = buy_sessions_with_one_record['TimeStamp'].apply(lambda x: x.strftime('%Y-%m'))
buy_sessions_with_month_count = buy_sessions_with_month.value_counts().sort_index()
buy_sessons_monthly_ratio = buy_sessions_with_month_count /sessions_with_month_count
ax = sessions_with_month_count.plot(kind='bar', alpha=0.5, color='b', label='click sessions')
buy_sessions_with_month_count.plot(kind='bar', ax=ax, alpha=0.5, color='y', label='buy sessions')
plt.legend(loc=2)
ax1 = ax.twinx()
buy_sessons_monthly_ratio.plot(ax=ax1, color='r',marker='o',markersize=5, label='buy/click ratio')
ax1.set_ylim([0, 0.07])
plt.legend()
print 'buy_sessons_monthly_ration:\n', buy_sessons_monthly_ratio.values
print 'mean: ', buy_sessons_monthly_ratio.values.mean(), ' std: ', buy_sessons_monthly_ratio.values.std()
buy_sessons_monthly_ration: [ 0.05987775 0.05767783 0.0547208 0.0524773 0.05327894 0.05268662] mean: 0.0551198724794 std: 0.00275779664857
def _week_of_year(ts):
    # Approximate week index: day-of-year integer-divided by 7.
    return ts.timetuple().tm_yday // 7

# Session counts bucketed by week of year (all sessions vs. buy sessions).
sessions_with_week = sessions_with_one_record['TimeStamp'].apply(_week_of_year)
sessions_with_week_count = sessions_with_week.value_counts().sort_index()
buy_sessions_with_week = buy_sessions_with_one_record['TimeStamp'].apply(_week_of_year)
buy_sessions_with_week_count = buy_sessions_with_week.value_counts().sort_index()
sessions_with_week_count.index
Int64Index([13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39], dtype='int64')
buy_sessions_week_ratio = buy_sessions_with_week_count /sessions_with_week_count
fig, ax1 = plt.subplots()
ax1.bar(sessions_with_week_count.index -0.4, sessions_with_week_count.values, 0.8, alpha=0.5, color='b', label='click sessions')
ax1.bar(buy_sessions_with_week_count.index -0.4, buy_sessions_with_week_count.values, 0.8, alpha=0.5, color='y', label='buy sessions')
plt.legend(loc=2)
ax1 = ax1.twinx()
ax1.plot(sessions_with_week_count.index, buy_sessions_week_ratio.values, color='r',marker='o',markersize=5, label='buy/click ratio')
ax1.set_ylim([0, 0.07])
mean_ration = buy_sessons_week_ratio.values.mean()
plt.axhline(mean_ration,color='r', linestyle='dotted', label='avg ratio:{:.3f}'.format(mean_ration))
plt.legend(loc=1)
print 'buy_sessons_week_ration:\n', buy_sessons_week_ratio.values
print 'mean: ', mean_ration, ' std: ', buy_sessons_week_ratio.values.std()
buy_sessons_week_ration: [ 0.06583851 0.0634527 0.06071765 0.05451357 0.04906384 0.0693577 0.06308127 0.04445721 0.05527256 0.05167474 0.0519934 0.05987153 0.05694732 0.05692821 0.05605115 0.05776482 0.04654129 0.05246112 0.05878865 0.04847408 0.05017729 0.05448669 0.04794499 0.04795683 0.06423523 0.04810544 0.02657807] mean: 0.0541754020361 std: 0.00834434971228
# Session counts per calendar day, each series scaled to [0, 1] for comparison.
sessions_with_day = sessions_with_one_record['TimeStamp'].apply(lambda ts: ts.date())
sessions_with_day_count = sessions_with_day.value_counts().sort_index()
buy_sessions_with_day = buy_sessions_with_one_record['TimeStamp'].apply(lambda ts: ts.date())
buy_sessions_with_day_count = buy_sessions_with_day.value_counts().sort_index()

n_days = len(sessions_with_day_count)
plt.title("[0,max] scaled sessions_per_day statistics")
plt.plot(sessions_with_day_count.values / sessions_with_day_count.values.max(), label="all_sessions_per_day")
plt.plot(buy_sessions_with_day_count.values / buy_sessions_with_day_count.values.max(), label="buy_sessions_per_day")
plt.plot(buy_sessions_with_day_count.values / sessions_with_day_count.values, label="ratio")
# Dotted vertical guides every 7 days.
plt.vlines(np.arange(0, n_days, 7), [0], [1], linestyles='dotted', label='Tuesday')
plt.legend(loc=2)
plt.xlim([0, n_days])
(0, 183)
sessions_with_weekday = sessions_with_one_record['TimeStamp'].dt.weekday
sessions_with_weekday_count = sessions_with_weekday.value_counts().sort_index()
buy_sessions_with_weekday = buy_sessions_with_one_record['TimeStamp'].dt.weekday
buy_sessions_with_weekday_count = buy_sessions_with_weekday.value_counts().sort_index()
buy_sessions_weekday_ratio = buy_sessions_with_weekday_count / sessions_with_weekday_count
fig, ax1 = plt.subplots()
ax1.bar(np.arange(7)-0.4, sessions_with_weekday_count.values, 0.8, alpha=0.5, color='b', label='click sessions')
ax1.bar(np.arange(7)-0.4, buy_sessions_with_weekday_count.values, 0.8, alpha=0.5, color='y', label='buy sessions')
plt.xticks(np.arange(7), ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'])
plt.legend(loc=2)
ax1.set_ylim([0, 2200000])
ax2 = ax1.twinx()
ax2.plot(np.arange(7), buy_sessions_weekday_ratio.values, color='r',marker='o',markersize=5, label='buy/click ratio')
ax2.set_ylim([0, 0.07])
plt.legend(loc=1)
print 'buy_sessons_weekday_ration:\n', buy_sessons_weekday_ratio.values
buy_sessons_weekday_ration: [ 0.05262548 0.03241294 0.04989819 0.05217814 0.05651848 0.06887879 0.0624327 ]
# Session counts per hour of day (0..23), plus the buy/click ratio.
sessions_with_hour = sessions_with_one_record['TimeStamp'].dt.hour
sessions_with_hour_count = sessions_with_hour.value_counts().sort_index()
buy_sessions_with_hour = buy_sessions_with_one_record['TimeStamp'].dt.hour
buy_sessions_with_hour_count = buy_sessions_with_hour.value_counts().sort_index()
buy_sessons_hour_ratio = buy_sessions_with_hour_count /sessions_with_hour_count
print 'buy_sessons_hour_ratio:\n', buy_sessons_hour_ratio.values
# Total sessions on the left y-axis, buy sessions on a twin right y-axis.
fig = plt.figure()
ax1 = fig.add_subplot(111)
ax1.plot(sessions_with_hour_count.values, color='b', label='total sessions')
ax1.set_ylabel('total sessions')
ax1.legend(loc=1)
ax2 = ax1.twinx()
ax2.plot(buy_sessions_with_hour_count, color='r', label='buy sessions')
ax2.set_ylabel('buy sessions')
ax2.legend(loc=2)
plt.grid()
ax2.set_xlim([0,23])
ax2.set_ylim([0,50000])
ax2.set_xticks(np.arange(0,24))
# Dotted stems from 0 up to each hour's total-session count.
ax1.vlines(np.arange(0,24), [0], sessions_with_hour_count.values, linestyles='dotted')
buy_sessons_hour_ratio: [ 0.02316928 0.02389155 0.02407361 0.02942573 0.03565868 0.04101091 0.05099876 0.05687048 0.05938213 0.05720923 0.0574788 0.05945185 0.05788164 0.05830583 0.05943467 0.06231237 0.06260238 0.06418852 0.06259432 0.05286058 0.04320709 0.03632203 0.02990141 0.02466328]
<matplotlib.collections.LineCollection at 0x7fe1002befd0>
In the next part, we will explore the timestamp of a session.
# it takes me about half an hour to run this cell, I really don't know how to speed up
# if you know, please let me know: tianjun.cpp@gmail.com
# Order records chronologically within each session.
# NOTE: in-place DataFrame.sort(...) is pandas<0.17 API (sort_values in modern pandas).
total.sort(['SesID', 'TimeStamp'], inplace=True)
# Gap between consecutive records of the same session (first record gets NaT).
total['TimeDiff'] = total.groupby('SesID', sort=False)['TimeStamp'].transform(lambda x: x.diff())
print "max diff: ",total['TimeDiff'].max() # less than an hour
max diff: 1970-01-01 00:59:59.995000
# Convert TimeDiff to seconds. Hours are deliberately ignored because the
# maximum gap printed by the previous cell is under one hour; falsy values
# (first record of a session) map to None.
total['TimeDiff'] = total['TimeDiff'].map(lambda x : x.microsecond / 10**6 + x.second + 60 * x.minute if x else None)
# Rank of each record within its session (0 for the first record).
total['TimeDiffRank'] = total.groupby('SesID', sort=False).cumcount()
click_records = total[total['IsBuy']==False]
buy_records = total[total['IsBuy']==True]
# Average gap before the i-th action (i = 1..99), separately for click-only
# and buy sessions.
click_time_diff_statistic = [click_records[click_records['TimeDiffRank']==i]['TimeDiff'] for i in range(1,100)]
avg_click_timediff = [x.mean() for x in click_time_diff_statistic]
plt.plot(np.arange(1,100), avg_click_timediff, color='b', label='avg time diff between two sequencial clicks in click sessions')
buy_time_diff_statistic = [buy_records[buy_records['TimeDiffRank']==i]['TimeDiff'] for i in range(1,100)]
avg_buy_timediff = [x.mean() for x in buy_time_diff_statistic]
plt.plot(np.arange(1,100), avg_buy_timediff, color='r', label='avg time diff between two sequencial clicks in buy sessions')
plt.legend()
<matplotlib.legend.Legend at 0x7fe266623250>
# Distribution of how many categories each item belongs to.
items_group_count['Category'].value_counts()
2 25407 1 17511 3 7740 4 1859 5 203 6 19 dtype: int64
# Click counts per raw Category value ('0', 'S', small ints, and long numeric brand IDs).
click['Category'].value_counts()
0 16337653 S 10769610 1 1671754 2 1292249 3 789713 4 480569 5 471923 6 414696 7 389910 9 105282 11 70264 10 69820 8 44840 12 19357 2089046251 7100 ... 2089134695 1 2089156240 1 2089286957 1 2089179505 1 2089580928 1 2089084275 1 2088962151 1 2089028904 1 2089251022 1 2089759631 1 2089508713 1 2089160270 1 2088970021 1 193464283 1 193465576 1 Length: 339, dtype: int64
# Number of distinct items per category, largest first; plot the categories
# with more than 100 items.
category_counts = category_group.count()
category_counts.columns = ['Count']
# NOTE: in-place DataFrame.sort(...) is pandas<0.17 API (sort_values in modern pandas).
category_counts.sort('Count',inplace=True, ascending=False)
category_counts[category_counts['Count']>100].plot(kind='bar')
<matplotlib.axes._subplots.AxesSubplot at 0x7ff101099210>
# Bar chart of the categories-per-item distribution.
items_group_count.columns=['Count']
items_group_count['Count'].value_counts().sort_index().plot(kind='bar')
<matplotlib.axes._subplots.AxesSubplot at 0x7ff10dc2c210>
# Number of distinct items viewed per session, for buy and click-only sessions.
n_views_in_buyses = buy_sesID_group['ItemID'].apply(lambda x: len(x.unique()))
n_views_in_buyses_stat = n_views_in_buyses.value_counts().sort_index()
n_views_in_clickses = click_sesID_group['ItemID'].apply(lambda x: len(x.unique()))
n_views_in_clickses_stat = n_views_in_clickses.value_counts().sort_index()
n_views_in_buyses_stat = n_views_in_buyses_stat / n_views_in_buyses_stat.sum() # scale to [0,1]
n_views_in_clickses_stat = n_views_in_clickses_stat / n_views_in_clickses_stat.sum() # scale to [0,1]
# BUG FIX: the original swapped the labels -- the 'click' column was assigned
# the buy-session stats and 'buy' the click-session stats.
g = pd.DataFrame({'click': n_views_in_clickses_stat[n_views_in_clickses_stat.index < 30],
                  'buy': n_views_in_buyses_stat[n_views_in_buyses_stat.index < 30]})
g.plot(kind='bar')
<matplotlib.axes._subplots.AxesSubplot at 0x7ff10bbfdcd0>
Different items' click rate
# Per-item click and buy record counts.
click_item_group = click.groupby('ItemID')
buy_item_group = buy.groupby('ItemID')
item_click_count = click_item_group.count()
item_buy_count = buy_item_group.count()
item_buy_count_sorted = item_buy_count['TimeStamp'].copy()
# NOTE: in-place Series.sort() is pandas<0.17 API (sort_values in modern pandas).
item_buy_count_sorted.sort(ascending=False)
#plt.plot(np.log(item_buy_count_sorted))
0.64997962203878679
# Daily click counts per category, as a (Category x Day) table.
click_groupby_category = click.groupby(['Category', click.TimeStamp.dt.dayofyear])
category_clickcount_daily = click_groupby_category.count()
category_clickcount_daily.index.names = ['Category','Day']
category_clickcount_daily = category_clickcount_daily['SesID'].unstack()
category_clickcount_daily.fillna(0, inplace=True)
# Stacked-area plot for the small-number categories '0'..'9' plus 'S'.
wanted = [str(i) for i in range(10)] + ['S']
selected_category_clickcount = category_clickcount_daily[category_clickcount_daily.index.isin(wanted)]
selected_category_clickcount.T.plot(kind='area')
<matplotlib.axes._subplots.AxesSubplot at 0x7feff3fe6f50>
# For Price and Quantity separately: daily counts of buy records with the
# value present vs. records where it is 0 (used as a missing-value marker).
for idx, col in enumerate(['Price','Quantity']):
    plt.subplot(2,1,idx+1)
    present = buy[~(buy[col] == 0)]
    present_day_count = present.groupby(present.TimeStamp.dt.dayofyear).count()['SesID']
    plt.bar(present_day_count.index, present_day_count.values, label='items have '+ col)
    missing = buy[(buy[col] == 0)]
    missing_day_count = missing.groupby(missing.TimeStamp.dt.dayofyear).count()['SesID']
    plt.bar(missing_day_count.index, missing_day_count.values, color='r',label='items lost '+col)
    plt.legend()
    plt.xlim([91, 273])
    plt.xlabel('day of year')
The above data shows that 610030 records have a Quantity and Price of 0! It has been discussed on Twitter: "if it's in buys, it has been bought at least once; however, if we didn't have the price or the quantity we wrote 0".
# Price distribution over [1, 10000) with bin width 100 (excludes price == 0 records).
plt.hist(buy['Price'].values, bins=np.arange(1, 10000, 100))
(array([ 1.17500000e+03, 3.05500000e+03, 9.01400000e+03, 2.14030000e+04, 2.61540000e+04, 3.85760000e+04, 2.98770000e+04, 3.85280000e+04, 3.07400000e+04, 1.86680000e+04, 5.86060000e+04, 9.26500000e+03, 3.72900000e+03, 2.74740000e+04, 5.57500000e+03, 1.49660000e+04, 1.03840000e+04, 3.65800000e+03, 6.83200000e+03, 3.30100000e+03, 2.40530000e+04, 7.60000000e+02, 7.70000000e+01, 8.24000000e+03, 2.40200000e+03, 9.12000000e+02, 1.19120000e+04, 2.02000000e+03, 1.15600000e+03, 2.79200000e+03, 7.19000000e+02, 1.11070000e+04, 8.30000000e+01, 3.44900000e+03, 1.19600000e+03, 1.17000000e+02, 5.98700000e+03, 8.88000000e+02, 7.98000000e+02, 8.57000000e+02, 3.24000000e+02, 1.78210000e+04, 1.60000000e+01, 1.98200000e+03, 1.00000000e+00, 3.99000000e+02, 3.80000000e+01, 2.18000000e+03, 7.30000000e+01, 1.32000000e+02, 2.95000000e+02, 8.80000000e+01, 1.02380000e+04, 3.00000000e+00, 2.10000000e+01, 1.73000000e+03, 7.40000000e+01, 1.28100000e+03, 5.60000000e+01, 9.70000000e+01, 1.36000000e+02, 4.03000000e+02, 7.66600000e+03, 5.00000000e+00, 3.40000000e+01, 1.23600000e+03, 0.00000000e+00, 8.00000000e+01, 1.58200000e+03, 1.90000000e+01, 6.00000000e+00, 1.25000000e+02, 4.80000000e+01, 4.84700000e+03, 3.00000000e+00, 2.60000000e+01, 7.37000000e+02, 2.00000000e+00, 9.59000000e+02, 4.00000000e+00, 3.80000000e+01, 1.03000000e+02, 1.80000000e+01, 5.07100000e+03, 0.00000000e+00, 1.30000000e+01, 2.30000000e+01, 7.10000000e+01, 8.24000000e+02, 0.00000000e+00, 1.90000000e+01, 1.80000000e+01, 7.20000000e+01, 1.70000000e+01, 4.24300000e+03, 0.00000000e+00, 1.00000000e+00, 6.60000000e+01, 3.00000000e+00]), array([ 1, 101, 201, 301, 401, 501, 601, 701, 801, 901, 1001, 1101, 1201, 1301, 1401, 1501, 1601, 1701, 1801, 1901, 2001, 2101, 2201, 2301, 2401, 2501, 2601, 2701, 2801, 2901, 3001, 3101, 3201, 3301, 3401, 3501, 3601, 3701, 3801, 3901, 4001, 4101, 4201, 4301, 4401, 4501, 4601, 4701, 4801, 4901, 5001, 5101, 5201, 5301, 5401, 5501, 5601, 5701, 5801, 5901, 6001, 6101, 6201, 6301, 6401, 6501, 6601, 6701, 
6801, 6901, 7001, 7101, 7201, 7301, 7401, 7501, 7601, 7701, 7801, 7901, 8001, 8101, 8201, 8301, 8401, 8501, 8601, 8701, 8801, 8901, 9001, 9101, 9201, 9301, 9401, 9501, 9601, 9701, 9801, 9901]), <a list of 99 Patch objects>)