The explanation of this implementation can be found at: http://www.rosariomgomez.me/
Index
import numpy as np
import pandas as pd
from create_features import create_pin_features, create_user_features
def get_db():
    '''Open a MongoDB connection, authenticate and hand back the database
    Input: none (server, port, database name and credentials are placeholders written in the body)
    Output: authenticated pymongo database object'''
    from pymongo import MongoClient
    # NOTE(review): `port` is not defined anywhere in this block - it looks like
    # a placeholder that has to be replaced together with 'server'
    connection = MongoClient('server', port) #server, port
    database = connection.database_name #database name
    database.authenticate("user", "pwd")
    return database
#open the database connection used by every query below
db = get_db()

#fetch every rating document from the DB
rated_outfits = db.ratings.find()
list_ratings = list(rated_outfits)

#fetch every user from the DB and turn each one into a feature vector
all_users = db.user.find()
list_users = [create_user_features(user) for user in all_users]

#fetch every pin from the DB and turn each one into a feature vector
all_pins = db.fullpin.find()
list_pins = [create_pin_features(pin) for pin in all_pins]

#items dataframe: one row per pin, '_id' renamed to 'pin_id' to match the ratings column name
items = pd.DataFrame(list_pins)
items = items.rename(columns={'_id': 'pin_id'})
items.head()
pin_id | blog_name | blogger_age | blogger_body_shape_apple | blogger_body_shape_hourglass | blogger_body_shape_inverted_triangle | blogger_body_shape_pear | blogger_body_shape_rectangle | blogger_dress_size | blogger_style | blogger_style_bohemian chic | blogger_style_casual chic | blogger_style_classic | blogger_style_edgy | blogger_style_preppy | blogger_style_romantic | brands_ASOS | brands_Abercrombie & Fitch | brands_Accessorize | brands_Alexander McQueen | ||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 537934c861e01f10f1118dea | Hallie Daily | 40 | 0 | 1 | 0 | 0 | 0 | 6 | [classic, romantic] | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | ... |
1 | 537934d261e01f10f1118e1b | Hallie Daily | 40 | 0 | 1 | 0 | 0 | 0 | 6 | [classic] | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... |
2 | 537934cc61e01f10f1118e00 | Hallie Daily | 40 | 0 | 1 | 0 | 0 | 0 | 6 | [classic] | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... |
3 | 537934c661e01f10f1118ddf | Hallie Daily | 40 | 0 | 1 | 0 | 0 | 0 | 6 | [classic, romantic] | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | ... |
4 | 537934c661e01f10f1118ddd | Hallie Daily | 40 | 0 | 1 | 0 | 0 | 0 | 6 | [classic] | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... |
5 rows × 319 columns
#build the ratings dataframe
#only the three columns named in cols are kept, in this order
cols = ['user_id', 'pin_id', 'rating']
ratings = pd.DataFrame(list_ratings, columns=cols)
#notebook preview of the first rows
ratings.head()
user_id | pin_id | rating | |
---|---|---|---|
0 | 538677f561e01f0be9e838f7 | 537933bb61e01f10f111886f | 0 |
1 | 538677f561e01f0be9e838f7 | 537933a161e01f10f11187e6 | 0 |
2 | 538677f561e01f0be9e838f7 | 53793d6d61e01f10f111a725 | 2 |
3 | 538677f561e01f0be9e838f7 | 537933be61e01f10f111887f | 2 |
4 | 538677f561e01f0be9e838f7 | 53793d8861e01f10f111a741 | 0 |
5 rows × 3 columns
#build the pandas users dataframe from the user feature vectors
users = pd.DataFrame(list_users)
#rename '_id' to 'user_id' so the key has the same name as in the ratings dataframe
users = users.rename(columns = {'_id':'user_id'}) #to be in line with the ratings names
#notebook preview of the first rows
users.head()
user_id | age | country | day_off | dress_size | fashionista | like_styles_pref | like_styles_pref_bohemian chic | like_styles_pref_casual chic | like_styles_pref_classic | like_styles_pref_edgy | like_styles_pref_preppy | like_styles_pref_romantic | nolike_styles_pref | nolike_styles_pref_bohemian chic | nolike_styles_pref_casual chic | nolike_styles_pref_classic | nolike_styles_pref_edgy | nolike_styles_pref_preppy | nolike_styles_pref_romantic | ||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 53962dfa3191490008a690df | 55 | US | sport | 10 | nolike | [classic, casual chic, preppy] | 0 | 1 | 1 | 0 | 1 | 0 | [] | 0 | 0 | 0 | 0 | 0 | 0 | ... |
1 | 53968e993d4e0c0007a2546f | 50 | US | family | 6 | nolike | [] | 0 | 0 | 0 | 0 | 0 | 0 | [edgy] | 0 | 0 | 0 | 1 | 0 | 0 | ... |
2 | 53971683b7d85a0008b1bbe2 | 30 | ES | family | 8 | nolike | [romantic, casual chic] | 0 | 1 | 0 | 0 | 0 | 1 | [edgy] | 0 | 0 | 0 | 1 | 0 | 0 | ... |
3 | 539851a17f6ba70007ba8bdb | 30 | ES | party | 10 | nolike | [romantic, casual chic, preppy] | 0 | 1 | 0 | 0 | 1 | 1 | [edgy] | 0 | 0 | 0 | 1 | 0 | 0 | ... |
4 | 539770c4a9a4570008c28a9b | 45 | ES | family | 10 | ok | [classic, casual chic, preppy] | 0 | 1 | 1 | 0 | 1 | 0 | [bohemian chic, edgy] | 1 | 0 | 0 | 1 | 0 | 0 | ... |
5 rows × 27 columns
#merge ratings, items and users dataframes
#the join keys are spelled out: without `on`, merge joins on every column name the
#two frames share, so a feature name common to both would silently become part of the key
fashion = pd.merge(pd.merge(ratings, users, on='user_id'), items, on='pin_id')
#notebook preview of the first rows
fashion.head()
user_id | pin_id | rating | age | country | day_off | dress_size | fashionista | like_styles_pref | like_styles_pref_bohemian chic | like_styles_pref_casual chic | like_styles_pref_classic | like_styles_pref_edgy | like_styles_pref_preppy | like_styles_pref_romantic | nolike_styles_pref | nolike_styles_pref_bohemian chic | nolike_styles_pref_casual chic | nolike_styles_pref_classic | nolike_styles_pref_edgy | ||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 538677f561e01f0be9e838f7 | 537933bb61e01f10f111886f | 0 | 30 | US | sport | 8 | love | [classic, casual chic, preppy] | 0 | 1 | 1 | 0 | 1 | 0 | [edgy] | 0 | 0 | 0 | 1 | ... |
1 | 539604e40aa8e20007a976fb | 537933bb61e01f10f111886f | 2 | 30 | ES | read | 8 | ok | [casual chic] | 0 | 1 | 0 | 0 | 0 | 0 | [] | 0 | 0 | 0 | 0 | ... |
2 | 539616880aa8e20008b99e19 | 537933bb61e01f10f111886f | 2 | 25 | ES | family | 6 | ok | [casual chic, preppy] | 0 | 1 | 0 | 0 | 1 | 0 | [edgy] | 0 | 0 | 0 | 1 | ... |
3 | 539628df3191490008a690d5 | 537933bb61e01f10f111886f | 2 | 40 | US | sport | 10 | ok | [classic] | 0 | 0 | 1 | 0 | 0 | 0 | [edgy] | 0 | 0 | 0 | 1 | ... |
4 | 5396d3a6989960000db79381 | 537933bb61e01f10f111886f | 1 | 30 | ES | party | 10 | nolike | [classic, casual chic] | 0 | 1 | 1 | 0 | 0 | 0 | [bohemian chic, edgy] | 1 | 0 | 0 | 1 | ... |
5 rows × 347 columns
#20% of each user data for testing
def assign_to_set(df):
    '''Randomly select 20% of the rows of the dataframe (rounded up) and set their for_testing column to True
    Input: dataframe (modified in place)
    Output: the same dataframe'''
    # the generator is re-seeded on every call so the selection is reproducible
    np.random.seed(1)
    n_test = int(np.ceil(df.index.size * 0.2))
    sampled_ids = np.random.choice(df.index, size=n_test, replace=False)
    # sampled_ids are index labels, so the label-based .loc indexer is used
    # (.ix is deprecated and no longer available in current pandas)
    df.loc[sampled_ids, 'for_testing'] = True
    return df
#every row starts in the training set; assign_to_set flags 20% of each user's rows
fashion['for_testing'] = False
grouped = fashion.groupby('user_id', group_keys=False).apply(assign_to_set)
#the flags live in `grouped`; they are used as a mask over the original `fashion` rows
fashion_train = fashion[grouped.for_testing == False]
fashion_test = fashion[grouped.for_testing == True]
#single-argument print calls behave the same on Python 2 and Python 3
print(fashion.shape)
print(fashion_train.shape)
print(fashion_test.shape)
#ensure we don't have the same rows on both sets
#(Index.intersection is the explicit set operation; `&` on an Index is not a set operation in current pandas)
assert len(fashion_train.index.intersection(fashion_test.index)) == 0
(2245, 348) (1783, 348) (462, 348)
#using RMSE as performance criterion
def compute_rmse(y_pred, y_true):
    '''Calculate the root mean square error between the predicted and true ratings
    Input: predicted ratings, true ratings (arrays or plain sequences of equal length, compared position by position)
    Output: (float) RMSE'''
    # coerce to float arrays: plain lists do not support element-wise subtraction,
    # and float arithmetic avoids wrap-around with unsigned integer inputs
    errors = np.asarray(y_pred, dtype=float) - np.asarray(y_true, dtype=float)
    return np.sqrt(np.mean(np.square(errors)))
def evaluate(estimate_f):
    '''Score a recommendation function on the test set
    Input: function that predicts the rating for a (user_id, pin_id) pair
    Output: (float) RMSE between the predictions and the ratings in fashion_test'''
    # one prediction per test row, in the row order of fashion_test
    predictions = np.array([
        estimate_f(user, pin)
        for user, pin in zip(fashion_test.user_id, fashion_test.pin_id)
    ])
    actual = fashion_test.rating.values
    return compute_rmse(predictions, actual)
def estimate1(user_id, pin_id):
    '''mean of user ratings
    Input: user id, pin id (unused; evaluate calls every estimate with both arguments)
    Output: mean training rating of the user, or 1 when the user has no training ratings'''
    user_condition = fashion_train.user_id == user_id
    user_ratings = fashion_train.loc[user_condition, 'rating']
    if user_ratings.empty:
        # the mean of an empty selection is NaN, which would turn the whole RMSE
        # into NaN; fall back to 1 like the other estimate functions
        return 1
    return user_ratings.mean()
#report the test-set RMSE of estimate1 (single-argument print call works on Python 2 and 3)
print('RMSE for estimate1: %s' % evaluate(estimate1))
RMSE for estimate1: 0.834525165107
#items table indexed by pin_id, used to look up the blogger (and styles) of a pin
items_info = items.set_index('pin_id')
#training ratings pivoted to a pin_id x blog_name table
#NOTE(review): rows=/cols= look like the older spelling of pivot_table's index=/columns=
#keywords - confirm against the installed pandas version
means_by_blogger = fashion_train.pivot_table(values='rating', rows='pin_id', cols='blog_name')
def estimate2(user_id, pin_id):
    '''mean rating of same blogger
    Input: user id (unused), pin id
    Output: mean of the blogger's column in means_by_blogger, or 1 when the blogger is not in that table'''
    # label-based lookup; .loc replaces the deprecated .ix indexer
    pin_blogger = items_info.loc[pin_id, 'blog_name']
    if pin_blogger not in means_by_blogger.columns:
        # blogger never rated in the training set
        return 1
    return means_by_blogger.loc[:, pin_blogger].mean() #mean value for that blogger
#report the test-set RMSE of estimate2 (single-argument print call works on Python 2 and 3)
print('RMSE for estimate2: %s' % evaluate(estimate2))
RMSE for estimate2: 0.81514703353
def style_cond(pin_id):
    '''items with the same styles
    Input: pin id
    Output: boolean Series aligned with fashion_train, True on the rows whose
    blogger_style_<style> flag is 1 for every style listed for the given pin'''
    # label-based lookup; .loc replaces the deprecated .ix indexer
    pin_styles = items_info.loc[pin_id, 'blogger_style']
    # start from an all-True Series instead of the scalar True, so a pin with an
    # empty style list still yields a row mask that fashion_train.loc accepts
    same_style_cond = pd.Series(True, index=fashion_train.index)
    for style in pin_styles:
        same_style_cond = same_style_cond & (fashion_train['blogger_style_' + style] == 1)
    return same_style_cond
def estimate3(user_id, pin_id):
    '''mean rating of same pin style
    Input: user id (unused), pin id
    Output: mean training rating of the rows selected by style_cond(pin_id), or 1 when no row is selected'''
    same_style_rows = fashion_train.loc[style_cond(pin_id)]
    if not same_style_rows.empty:
        return same_style_rows.rating.mean()
    # no training row shares the styles of this pin
    return 1
#report the test-set RMSE of estimate3 (single-argument print call works on Python 2 and 3)
print('RMSE for estimate3: %s' % evaluate(estimate3))
RMSE for estimate3: 0.825515788148
def estimate4(user_id, pin_id):
    '''mean of the items with the same style rated by the user
    Input: user id, pin id
    Output: mean training rating the user gave to rows selected by style_cond(pin_id), or 1 when there is none'''
    rated_by_user = fashion_train.user_id == user_id
    matching_rows = fashion_train.loc[rated_by_user & style_cond(pin_id)]
    if not matching_rows.empty:
        return matching_rows.rating.mean()
    # the user has no training rating on a pin with these styles
    return 1
#report the test-set RMSE of estimate4 (single-argument print call works on Python 2 and 3)
print('RMSE for estimate4: %s' % evaluate(estimate4))
RMSE for estimate4: 0.840272347162
def estimate5(user_id, pin_id):
    '''mean rating of same blogger by the user
    Input: user id, pin id
    Output: mean training rating the user gave to pins of the same blogger, or 1 when there is none'''
    user_condition = fashion_train.user_id == user_id
    # label-based lookup; .loc replaces the deprecated .ix indexer
    pin_blogger = items_info.loc[pin_id, 'blog_name']
    pin_condition = fashion_train.blog_name == pin_blogger
    ratings_by_user_bloggers = fashion_train.loc[user_condition & pin_condition]
    if ratings_by_user_bloggers.empty:
        # the user never rated this blogger in the training set
        return 1
    return ratings_by_user_bloggers.rating.mean() #mean value for that blogger from the specific user
#report the test-set RMSE of estimate5 (single-argument print call works on Python 2 and 3)
print('RMSE for estimate5: %s' % evaluate(estimate5))
RMSE for estimate5: 0.813904952702