import numpy as np
import matplotlib.pyplot as plt
%pylab inline
from MERhelpers import *
matplotlib.pyplot.xkcd(scale=0.5, length=100, randomness=5)
Populating the interactive namespace from numpy and matplotlib
<matplotlib.rc_context at 0x105e6c310>
# Colour palette reused by the figures below (hex RGB strings).
mainColour, secondColour1, secondColour2 = '#2F4F4F', '#666666', '#7491A3'
# Load the analytics export through the MERhelpers loader.
# NOTE(review): judging by the helper's name, the three results are the nested
# analytics dict, the click dates and the click counts -- confirm in MERhelpers.
(dict_all,
 date_clicks,
 num_clicks) = data_to_dict_clickdates_clickscount('Analytics_Winter2014.csv')

# Course codes handed to clean_dict (presumably the courses to keep -- verify
# against MERhelpers.clean_dict).
wiki_courses = ['MATH' + number for number in
                '100 101 102 103 104 105 110 152 200 215 220 221 257 437'.split()]
clean_dict(dict_all, wiki_courses)

# Courses present in dict_all after cleaning; every per-course list below is
# index-aligned with this one.
courses = list_courses(dict_all)
# Per-course aggregates; entry i of each list belongs to courses[i].
page_views = []       # sum of 'pageviews' over every question of the course
total_time = []       # mean of the per-question 'avg_time' values
pages_per_visit = []  # mean of the per-question 'pages_per_visit' values
for course in courses:
    # Flatten course -> exam -> question into one list of question records.
    course_records = []
    for exam in list_exams(dict_all, course):
        for q in list_questions(dict_all, course, exam):
            course_records.append(dict_all[course][exam][q])
    # `mean` is not imported explicitly; it is taken from the interactive
    # (%pylab / star-import) namespace.
    pages_per_visit.append(mean([record['pages_per_visit'] for record in course_records]))
    page_views.append(sum(record['pageviews'] for record in course_records))
    total_time.append(mean([record['avg_time'] for record in course_records]))

# One bar position per course for the horizontal bar charts.
y_pos = np.arange(len(courses))
# Bare expression: echoes the list when it is the last line of a notebook cell.
total_time
# Three side-by-side horizontal bar charts, one bar per course.
# The size is passed through the documented `figsize=` keyword; the bare
# `figsize(...)` helper used before is not a Matplotlib API (presumably supplied
# by %pylab) and its return value was being passed as the figure number.
plt.figure(figsize=(7, 5))
# right=2 pushes the subplot area to twice the figure width so that the three
# panels are wide enough to read.
plt.subplots_adjust(right=2)

# (subplot position, values, bar colour, x-axis label, panel title)
panels = [
    (131, page_views, mainColour,
     'number of page views', 'Absolute page views by course'),
    (132, total_time, secondColour1,
     'time (s)', 'Average question page viewing time by course'),
    (133, pages_per_visit, secondColour2,
     'number of pages per session', 'Average number of page views per session by course'),
]
for subplot_code, values, bar_colour, x_label, panel_title in panels:
    plt.subplot(subplot_code)
    plt.barh(y_pos, values, align='center', alpha=0.9, color=bar_colour)
    plt.yticks(y_pos, courses)
    plt.grid()
    plt.xlabel(x_label)
    plt.title(panel_title)
plt.show()
The plot below is a scatter plot of each course's total number of page views against its average number of pages viewed per session.
def get_years(course_list):
    """Return the year level of every course code in *course_list*.

    The year level is the first digit that appears in the code, e.g.
    'MATH257' -> 2.  Looking for the first digit (rather than reading a fixed
    character position) keeps the result unchanged for four-letter subject
    codes while also handling prefixes of other lengths.

    Parameters:
        course_list: iterable of course-code strings.

    Returns:
        A list of ints, one per course, in the same order as the input.

    Raises:
        ValueError: if a course code contains no digit.
    """
    years = []
    for course in course_list:
        first_digit = next((ch for ch in course if ch.isdigit()), None)
        if first_digit is None:
            raise ValueError('course code contains no number: %r' % (course,))
        years.append(int(first_digit))
    return years
# Scatter plot with one marker per course.
years = get_years(courses)
x = page_views        # total page views of the course
y = pages_per_visit   # mean pages per session of the course
area = total_time     # marker area   <- mean question viewing time
colors = years        # marker colour <- value returned by get_years
# Use the documented `figsize=` keyword instead of passing the result of the
# non-Matplotlib `figsize(...)` helper as the figure number.
plt.figure(figsize=(6, 4))
c = plt.scatter(x, y, s=area, c=colors)
c.set_alpha(0.3)
plt.grid()
# Optional per-course labels and fixed axis limits, currently disabled:
#for i, txt in enumerate(courses):
#    plt.annotate(txt, (x[i],y[i]),rotation=-5)
#plt.xlim([-15000,190000])
#plt.ylim([0,35])
plt.xlabel('Total page views')
plt.ylabel('Avg. pages per session')
plt.title('Page views vs. Pages/Session')
plt.show()
def isSameQuestion(q1,q2):
    """Decide from two question titles of the same exam whether they denote the same question.

    Two titles match when
      * the integer formed by all their digits is equal (so '1' and '01' agree), and
      * their lowercase letters, taken in order, are identical.

    A title without any digit has no number; two such titles can still match
    on their letters, and a numbered title never matches an unnumbered one.
    (Previously a digit-less title raised ValueError from int('').)

    Parameters:
        q1, q2: question title strings.

    Returns:
        True if both titles denote the same question, otherwise False.
    """
    def _signature(title):
        # (question number or None, lowercase letters) identifying the question
        digits = ''.join(ch for ch in title if ch.isdigit())
        letters = ''.join(ch for ch in title if ch.islower())
        number = int(digits) if digits else None
        return number, letters

    return _signature(q1) == _signature(q2)
## WARNING: THIS IS NOT FINISHED YET AND MORE WORK MAY BE NEEDED TO PROPERLY BLEND THE QUESTION DATA.
def combine_two_questions_data(q1,q2):
    """Return the combined page views and average viewing time of two question pages.

    Parameters:
        q1, q2: question records; only their 'pageviews' and 'avg_time'
            entries are read.

    Returns:
        (pgviews, avgtime) where pgviews is the sum of both page-view counts
        and avgtime is the page-view-weighted mean of both average times.
        When neither page has any views the average is reported as 0.0
        instead of dividing by zero.
    """
    pgviews = q1['pageviews'] + q2['pageviews']
    if pgviews == 0:
        return pgviews, 0.0
    weighted_time = q1['avg_time']*q1['pageviews'] + q2['avg_time']*q2['pageviews']
    # float() forces true division even for integer inputs under Python 2.
    avgtime = float(weighted_time) / pgviews
    return pgviews, avgtime
def clean_exam(my_dict,course,exam,min_pageviews=30):
    """Clean the question records of one exam in place.

    Two passes are made over my_dict[course][exam]:
      1. every question with fewer than *min_pageviews* page views is deleted;
      2. questions whose titles denote the same question (see isSameQuestion)
         are merged into the first of them and the duplicates are deleted.

    Only 'pageviews' and 'avg_time' of the surviving record are updated by a
    merge; its other entries are left as they were.

    Parameters:
        my_dict: nested dict indexed as my_dict[course][exam][question].
        course, exam: keys selecting the exam to clean.
        min_pageviews: minimum number of page views a question needs to be
            kept (default 30, the previously hard-coded value).

    Returns:
        None; my_dict is modified in place.
    """
    exam_data = my_dict[course][exam]

    #Remove pages that have less than a certain number of views
    # list() takes a snapshot so deleting entries cannot disturb the iteration.
    for question in list(list_questions(my_dict,course,exam)):
        if exam_data[question]['pageviews'] < min_pageviews:
            del exam_data[question]

    #Merge questions that are the same, but were labeled using a different scheme previously
    questions = list(list_questions(my_dict,course,exam))
    # Titles already folded into an earlier question. Without this, three or
    # more titles for one question made the loop look up a deleted record.
    merged_away = set()
    for position, q1 in enumerate(questions):
        if q1 in merged_away:
            continue
        for q2 in questions[position+1:]:
            if q2 in merged_away or not isSameQuestion(q1,q2):
                continue
            pgv,avt = combine_two_questions_data(exam_data[q1],exam_data[q2])
            exam_data[q1]['pageviews'] = pgv
            exam_data[q1]['avg_time'] = avt
            del exam_data[q2]
            merged_away.add(q2)
def get_question_data_array(mydict,course,exam,question_num,min_pageviews=30):
    """Collect the statistics of every page belonging to one question number.

    A page belongs to *question_num* when the integer formed by all digits of
    its title equals that number (so sub-parts of a question share its
    number).  Pages with fewer than *min_pageviews* views are ignored, and so
    are pages whose title has no digit at all (these used to raise
    ValueError).

    Parameters:
        mydict: nested dict indexed as mydict[course][exam][question].
        course, exam: keys selecting the exam.
        question_num: integer question number to collect.
        min_pageviews: minimum page views for a page to count (default 30,
            the same cut-off that clean_exam applies).

    Returns:
        A tuple of five lists, each with one entry per matching page:
        (avg_time, visit_duration, pageviews, unique_pageviews, pages_per_visit).
    """
    fields = ('avg_time', 'visit_duration', 'pageviews',
              'unique_pageviews', 'pages_per_visit')
    columns = dict((field, []) for field in fields)
    for question in list_questions(mydict,course,exam):
        record = mydict[course][exam][question]
        if record['pageviews'] < min_pageviews:
            continue
        digits = ''.join(ch for ch in question if ch.isdigit())
        if not digits or int(digits) != question_num:
            continue
        for field in fields:
            columns[field].append(record[field])
    return tuple(columns[field] for field in fields)
def get_question_data_array_for_exam(mydict,course,exam):
    """Gather the per-question statistics of a whole exam.

    Question numbers are queried in order starting at 1 through
    get_question_data_array, and collection stops at the first number that
    yields no data.  Questions numbered after such a gap are therefore not
    included.

    Returns:
        A tuple of five lists (avg_time, visit_duration, pageviews,
        unique_pageviews, pages_per_visit).  Entry i of each list is itself
        the list returned for question number i + 1.
    """
    collected = ([], [], [], [], [])
    question_num = 1
    while True:
        per_question = get_question_data_array(mydict,course,exam,question_num)
        # The first of the five lists (average times) decides whether the
        # question has any data at all.
        if len(per_question[0]) == 0:
            break
        for bucket, values in zip(collected, per_question):
            bucket.append(values)
        question_num += 1
    return collected
def plot_question_info(course,exam):
    """Draw a variable-width bar chart of one exam on the current figure.

    Every page of a question becomes one bar whose width is the page's
    average viewing time and whose height is its page-view count.  Bars are
    laid side by side along the x axis, grouped by question; each group is
    drawn in one colour and labelled 'Q<n>' at its centre.

    The data is read from the module-level dict_all via
    get_question_data_array_for_exam.

    Parameters:
        course, exam: keys selecting the exam in dict_all.

    Returns:
        None; the chart is shown with plt.show().
    """
    avt,vdr,pvs,upv,ppv = get_question_data_array_for_exam(dict_all,course,exam)
    # The palette is cycled, so any number of questions can be drawn.
    palette = [mainColour,secondColour1,secondColour2]
    pos_final = 0   # running right edge of everything drawn so far
    xt = []         # x position of the centre of each question group
    for count,(pv,at) in enumerate(zip(pvs,avt)):
        group_width = sum(at)
        # Left edge of every bar in this group: cumulative widths of the bars
        # before it, shifted by the groups already drawn.
        positions = [pos_final] + [pos_final + offset for offset in np.cumsum(at)[:-1]]
        # align='edge' states explicitly that `positions` are left edges;
        # newer Matplotlib releases centre bars on x by default.
        plt.bar(positions,pv,at,align='edge',color=palette[count % len(palette)],alpha=0.9)
        xt.append(pos_final + group_width/2.0)
        pos_final = pos_final + group_width
    # Exactly one label per tick; a fixed-length label list does not match
    # the tick count unless the exam happens to have that many questions.
    plt.xticks(xt, ['Q' + str(number) for number in range(1,len(xt)+1)])
    plt.xlim([0,pos_final])
    plt.title(course + ', '+ exam + ' (average total viewing time = '+ str(round(float(pos_final)/60,1)) + ' mins)')
    plt.show()
Below are plots of exam data. The width of each box is proportional to the average viewing time a question receives, while the height is equal to the total number of clicks the question has received since it was written.
# Exams to plot; course_list[i] is paired with exam_list[i].
course_list = ['MATH101','MATH103','MATH105','MATH257','MATH220']
exam_list = ['April_2012','April_2012','April_2012','December_2011','April_2011']
for cour,exam in zip(course_list,exam_list):
    # Use the documented `figsize=` keyword instead of passing the result of
    # the non-Matplotlib `figsize(...)` helper as the figure number.
    plt.figure(figsize=(13, 4))
    # Drop low-traffic pages and merge duplicate question pages before plotting.
    clean_exam(dict_all,cour,exam)
    plot_question_info(cour,exam)