# Exploratory analysis of course ratings: load the users / ratings / courses
# tables, merge them into one `coursetalk` frame, compute per-provider and
# per-title rating summaries, rank titles by their share of ratings >= 4, and
# list the courses most often co-rated with one target course.
#
# Bare expressions (e.g. `ratings.head(10)`) are kept from the notebook this
# script came from; they only display a value in an interactive session.
from IPython.core.display import Image

import pandas as pd

# Title of the course whose raters are used for the co-rating similarity.
TARGET_COURSE = 'An Introduction to Interactive Programming in Python'

Image(filename='./imgs/recsys_arch.png')

# --- Load the pipe-delimited, header-less data files -----------------------
unames = ['user_id', 'username']
users = pd.read_table('./data/users_set.dat', sep='|', header=None,
                      names=unames)

rnames = ['user_id', 'course_id', 'rating']
ratings = pd.read_table('./data/ratings.dat', sep='|', header=None,
                        names=rnames)

mnames = ['course_id', 'title', 'avg_rating', 'workload', 'university',
          'difficulty', 'provider']
courses = pd.read_table('./data/cursos.dat', sep='|', header=None,
                        names=mnames)

# show how one of them looks
ratings.head(10)

# show how one of them looks
users[:5]
courses[:5]

# --- Merge everything into a single frame ----------------------------------
# pd.merge joins on the columns the two frames share.
coursetalk = pd.merge(pd.merge(ratings, courses), users)
coursetalk
coursetalk.loc[0]

# --- Mean rating per provider ----------------------------------------------
# Selecting ['rating'] keeps the result a Series so it can be sorted directly.
mean_ratings = coursetalk.pivot_table(values='rating', index='provider',
                                      aggfunc='mean')['rating']
mean_ratings.sort_values(ascending=False)

# --- Titles with at least 20 ratings ---------------------------------------
ratings_by_title = coursetalk.groupby('title').size()
ratings_by_title[:10]

active_titles = ratings_by_title.index[ratings_by_title >= 20]
active_titles[:10]

# --- Mean rating per title -------------------------------------------------
mean_ratings = coursetalk.pivot_table(values='rating', index='title',
                                      aggfunc='mean')['rating']
mean_ratings
# reindex (rather than .loc) yields NaN instead of raising if a title is
# missing from the pivot.
mean_ratings.reindex(active_titles).sort_values(ascending=False)

# --- Mean rating per title, split by provider ------------------------------
mean_ratings = coursetalk.pivot_table(values='rating', index='title',
                                      columns='provider', aggfunc='mean')
mean_ratings[:10]
(mean_ratings['coursera']
 .reindex(active_titles)
 .sort_values(ascending=False)[:10])

# --- Ratings matrix: one row per user, one column per title ----------------
# transform the ratings frame into a ratings matrix
ratings_mtx_df = coursetalk.pivot_table(values='rating', index='user_id',
                                        columns='title')
# Positional slicing shows the top-left 15x15 corner regardless of the
# index labels.
ratings_mtx_df.iloc[:15, :15]

# Keep only ratings >= 4.0; everything else becomes NaN.
ratings_gte_4 = ratings_mtx_df[ratings_mtx_df >= 4.0]
ratings_gte_4.iloc[:15, :15]

# Per title: number of ratings, number of ratings >= 4, and their ratio.
ratings_gte_4_pd = pd.DataFrame({'total': ratings_mtx_df.count(),
                                 'gte_4': ratings_gte_4.count()})
ratings_gte_4_pd.head(10)
ratings_gte_4_pd['gte_4_ratio'] = (
    (ratings_gte_4_pd['gte_4'] * 1.0) / ratings_gte_4_pd.total)
ratings_gte_4_pd.head(10)

# Select the columns explicitly: the tuples below are unpacked by position,
# and the column order of a dict-built DataFrame is not something to rely on
# (it differs between pandas versions), which would silently swap `total`
# and `gte_4`.
ranking = [
    (title, total, gte_4, score)
    for title, total, gte_4, score
    in ratings_gte_4_pd[['total', 'gte_4', 'gte_4_ratio']].itertuples()
]

# Top 10 by ratio, then by count of ratings >= 4, then by total ratings.
top_by_ratio = sorted(ranking, key=lambda row: (row[3], row[2], row[1]),
                      reverse=True)[:10]
for title, total, gte_4, score in top_by_ratio:
    print('%s %s %s %s' % (title, total, gte_4, score))

ratings_by_title = coursetalk.groupby('title').size()
ratings_by_title.sort_values(ascending=False)[:10]

# Top 10 by count of ratings >= 4, then by ratio, then by total ratings.
top_by_count = sorted(ranking, key=lambda row: (row[2], row[3], row[1]),
                      reverse=True)[:10]
for title, total, gte_4, score in top_by_count:
    print('%s %s %s %s' % (title, total, gte_4, score))

# --- Courses co-rated with TARGET_COURSE -----------------------------------
# One row per title, one column per user.
course_users = coursetalk.pivot_table(values='rating', index='title',
                                      columns='user_id')
course_users.iloc[:15, :15]

# Build the re-indexed frame in one expression instead of mutating a
# filtered slice of `coursetalk` in place.
ratings_by_course = (coursetalk[coursetalk.title == TARGET_COURSE]
                     .set_index('user_id'))
their_ids = ratings_by_course.index

# Restrict the matrix to the users who rated the target course.
their_ratings = course_users[their_ids]
their_ratings.iloc[:15, :15]

# Fraction of the target course's raters who also rated each course.
course_count = their_ratings.loc[TARGET_COURSE].count()
sims = their_ratings.count(axis=1) / float(course_count)

# Drop the target course by label; skipping "the first row" after sorting
# could discard a different course when several tie at the top.
sims.drop(TARGET_COURSE).sort_values(ascending=False)[:10]