Building a fashion recommender (III): Content-based recommender

The explanation of this implementation can be found at: http://www.rosariomgomez.me/

Index

1. Build the training and testing sets
2. Estimation functions
3. Content based recommendation engine

1. Build the training and testing sets¶

1.1. Build the dataframes¶

In [1]:

import numpy as np
import pandas as pd
from create_features import create_pin_features, create_user_features

In [2]:

def get_db():
    '''Connect to MongoDB and return an authenticated database handle
    Input: none
    Output: the pymongo database object reached as client.database_name

    NOTE: 'server', port, database_name, "user" and "pwd" look like
    placeholders to be replaced with real connection settings; `port` is not
    defined inside this function.'''
    from pymongo import MongoClient #imported only when a connection is requested
    mongo_client = MongoClient('server', port) #server, port
    database = mongo_client.database_name #database name
    database.authenticate("user", "pwd")
    return database

In [3]:

#open the authenticated database handle that the db.<collection>.find() queries read from
db = get_db()

In [4]:

#fetch every rating document from the DB and materialise the cursor into a list
rated_outfits = db.ratings.find()
list_ratings = list(rated_outfits)

In [5]:

#fetch every user from the DB and run each document through create_user_features
all_users = db.user.find()
list_users = list(map(create_user_features, all_users))

In [6]:

#fetch every item (pin) from the DB and run each document through create_pin_features
all_pins = db.fullpin.find()
list_pins = list(map(create_pin_features, all_pins))

In [7]:

#items dataframe built from the pin feature records;
#'_id' is renamed to 'pin_id' so the key has the same name as in the ratings
items = pd.DataFrame(list_pins).rename(columns={'_id': 'pin_id'})
items.head()

Out[7]:

	pin_id	blog_name	blogger_age	blogger_body_shape_hourglass	blogger_dress_size	blogger_style	blogger_style_classic	blogger_style_romantic
0	537934c861e01f10f1118dea	Hallie Daily	40	1	6	[classic, romantic]	1	1	...
1	537934d261e01f10f1118e1b	Hallie Daily	40	1	6	[classic]	1	0	...
2	537934cc61e01f10f1118e00	Hallie Daily	40	1	6	[classic]	1	0	...
3	537934c661e01f10f1118ddf	Hallie Daily	40	1	6	[classic, romantic]	1	1	...
4	537934c661e01f10f1118ddd	Hallie Daily	40	1	6	[classic]	1	0	...

5 rows × 319 columns

In [8]:

#build the ratings dataframe with exactly the columns named in cols
#(presumably each entry of list_ratings is a Mongo document carrying these keys - verify)
cols = ['user_id', 'pin_id', 'rating']
ratings = pd.DataFrame(list_ratings, columns=cols)
ratings.head() #notebook preview of the first rows

Out[8]:

	user_id	pin_id	rating
0	538677f561e01f0be9e838f7	537933bb61e01f10f111886f	0
1	538677f561e01f0be9e838f7	537933a161e01f10f11187e6	0
2	538677f561e01f0be9e838f7	53793d6d61e01f10f111a725	2
3	538677f561e01f0be9e838f7	537933be61e01f10f111887f	2
4	538677f561e01f0be9e838f7	53793d8861e01f10f111a741	0

5 rows × 3 columns

In [9]:

#users dataframe built from the user feature records;
#'_id' is renamed to 'user_id' so the key has the same name as in the ratings
users = pd.DataFrame(list_users).rename(columns={'_id': 'user_id'})
users.head()

Out[9]:

	user_id	age	country	day_off	dress_size	fashionista	like_styles_pref	like_styles_pref_casual chic	like_styles_pref_classic	like_styles_pref_preppy	like_styles_pref_romantic	nolike_styles_pref	nolike_styles_pref_bohemian chic	nolike_styles_pref_edgy
0	53962dfa3191490008a690df	55	US	sport	10	nolike	[classic, casual chic, preppy]	1	1	1	0	[]	0	0	...
1	53968e993d4e0c0007a2546f	50	US	family	6	nolike	[]	0	0	0	0	[edgy]	0	1	...
2	53971683b7d85a0008b1bbe2	30	ES	family	8	nolike	[romantic, casual chic]	1	0	0	1	[edgy]	0	1	...
3	539851a17f6ba70007ba8bdb	30	ES	party	10	nolike	[romantic, casual chic, preppy]	1	0	1	1	[edgy]	0	1	...
4	539770c4a9a4570008c28a9b	45	ES	family	10	ok	[classic, casual chic, preppy]	1	1	1	0	[bohemian chic, edgy]	1	1	...

5 rows × 27 columns

In [10]:

#merge ratings, users and items into one dataframe (inner joins, the merge default)
#the join keys are spelled out: without `on`, merge joins on every column name the two
#frames share, so a future feature column present in both would silently become part of the key
fashion = ratings.merge(users, on='user_id').merge(items, on='pin_id')
fashion.head()

Out[10]:

	user_id	pin_id	rating	age	country	day_off	dress_size	fashionista	like_styles_pref	like_styles_pref_casual chic	like_styles_pref_classic	like_styles_pref_preppy	nolike_styles_pref	nolike_styles_pref_bohemian chic	nolike_styles_pref_edgy
0	538677f561e01f0be9e838f7	537933bb61e01f10f111886f	0	30	US	sport	8	love	[classic, casual chic, preppy]	1	1	1	[edgy]	0	1	...
1	539604e40aa8e20007a976fb	537933bb61e01f10f111886f	2	30	ES	read	8	ok	[casual chic]	1	0	0	[]	0	0	...
2	539616880aa8e20008b99e19	537933bb61e01f10f111886f	2	25	ES	family	6	ok	[casual chic, preppy]	1	0	1	[edgy]	0	1	...
3	539628df3191490008a690d5	537933bb61e01f10f111886f	2	40	US	sport	10	ok	[classic]	0	1	0	[edgy]	0	1	...
4	5396d3a6989960000db79381	537933bb61e01f10f111886f	1	30	ES	party	10	nolike	[classic, casual chic]	1	1	0	[bohemian chic, edgy]	1	1	...

5 rows × 347 columns

1.2. Create the training and testing sets¶

In [11]:

#20% of each user's data is held out for testing
def assign_to_set(df):
    '''Randomly select 20% of the rows (rounded up) and set their for_testing value to True
    Input: dataframe (modified in place)
    Output: the same dataframe

    The global numpy RNG is re-seeded with 1 on every call, so the selection is
    reproducible; note that this also resets the random state for any caller.'''
    np.random.seed(1)
    n_test = int(np.ceil(df.index.size * 0.2)) #ceil: even a single-row frame contributes one test row
    sampled_ids = np.random.choice(df.index, size=n_test, replace=False)
    #sampled_ids are index labels, so use label-based .loc (.ix was removed in pandas 1.0)
    df.loc[sampled_ids, 'for_testing'] = True
    return df

#flag 20% of every user's rows through assign_to_set, then split on that flag
fashion['for_testing'] = False
grouped = fashion.groupby('user_id', group_keys=False).apply(assign_to_set)
fashion_train = fashion[grouped.for_testing == False]
fashion_test = fashion[grouped.for_testing == True]
print(fashion.shape)
print(fashion_train.shape)
print(fashion_test.shape)
#ensure no row ends up in both sets; Index.intersection is an explicit set operation,
#whereas `&` between two Index objects is no longer a set intersection in recent pandas
assert len(fashion_train.index.intersection(fashion_test.index)) == 0

(2245, 348)
(1783, 348)
(462, 348)

2. Estimation functions¶

In [12]:

#using RMSE as performance criterion
def compute_rmse(y_pred, y_true):
    '''Calculate the root mean square error between the predicted and true ratings
    Input: predicted ratings, true ratings (scalars, sequences or numpy arrays,
           compared element by element)
    Output: (float) RMSE'''
    #coerce to float arrays so plain lists work and unsigned integer
    #inputs cannot wrap around when subtracted
    errors = np.asarray(y_pred, dtype=float) - np.asarray(y_true, dtype=float)
    return np.sqrt(np.mean(np.square(errors)))

In [13]:

def evaluate(estimate_f):
    '''Score a rating estimator on the held-out test rows
    Input: function that predicts the rating of a (user_id, pin_id) pair
    Output: (float) RMSE between the predictions and the true test ratings'''
    test_pairs = zip(fashion_test.user_id, fashion_test.pin_id) #(user_id, pin_id) for every test row
    predictions = [estimate_f(user, pin) for user, pin in test_pairs] #one prediction per test row, in row order
    return compute_rmse(np.array(predictions), fashion_test.rating.values)

3. Content based recommendation engine¶

3.1. Mean items rating by user¶

In [14]:

def estimate1(user_id, pin_id):
    '''mean of user ratings
    Input: user id, pin id (not used by this estimator)
    Output: mean training rating given by the user, or 1 when the user has no training ratings'''
    user_ratings = fashion_train.loc[fashion_train.user_id == user_id, 'rating']
    #the mean of an empty selection is NaN, and a single NaN prediction turns the whole
    #RMSE into NaN; fall back to 1 like the other estimators in this notebook do
    if user_ratings.empty:
        return 1
    return user_ratings.mean()

print('RMSE for estimate1: %s' % evaluate(estimate1))

RMSE for estimate1: 0.834525165107

3.2. Mean items rating grouped by blogger¶

In [15]:

#items indexed by pin_id, so the attributes of a pin (e.g. its blogger) can be looked up by id
items_info = items.set_index('pin_id')
#mean training rating for each (pin, blogger) pair: one row per pin, one column per blogger
#`rows`/`cols` were renamed to `index`/`columns` in pandas 0.14 and the old keywords were later removed
means_by_blogger = fashion_train.pivot_table(values='rating', index='pin_id', columns='blog_name')

In [16]:

def estimate2(user_id, pin_id):
    '''mean rating of same blogger
    Input: user id (not used by this estimator), pin id
    Output: mean of the blogger's column in means_by_blogger, or 1 when the blogger has no column there'''
    #label-based lookup of the pin's blogger (.ix was removed in pandas 1.0)
    pin_blogger = items_info.loc[pin_id, 'blog_name']

    if pin_blogger in means_by_blogger.columns:
        return means_by_blogger.loc[:, pin_blogger].mean() #mean value for that blogger
    return 1 #blogger never rated in the training set

print('RMSE for estimate2: %s' % evaluate(estimate2))

RMSE for estimate2: 0.81514703353

3.3. Mean items rating by item style¶

In [17]:

def style_cond(pin_id):
    '''items with the same styles
    Input: pin id
    Output: boolean Series over fashion_train, True for the rows whose item has every style of the given pin'''
    #label-based lookup of the pin's style list (.ix was removed in pandas 1.0)
    pin_styles = items_info.loc[pin_id, 'blogger_style']
    #start from an all-True mask rather than a bare True, so a pin without styles
    #still produces a Series that can be used to index fashion_train
    same_style_cond = pd.Series(True, index=fashion_train.index)
    for style in pin_styles:
        same_style_cond = same_style_cond & (fashion_train['blogger_style_' + style] == 1)
    return same_style_cond

In [18]:

def estimate3(user_id, pin_id):
    '''mean rating of same pin style
    Input: user id (not used by this estimator), pin id
    Output: mean training rating of the rows selected by style_cond, or 1 when none match'''
    same_style_rows = fashion_train.loc[style_cond(pin_id)]
    return 1 if same_style_rows.empty else same_style_rows.rating.mean()

print('RMSE for estimate3: %s' % evaluate(estimate3))

RMSE for estimate3: 0.825515788148

3.4. Mean items rating by item style and user¶

In [20]:

def estimate4(user_id, pin_id):
    '''mean of the items with the same style rated by the user
    Input: user id, pin id
    Output: mean training rating of the user's rows selected by style_cond, or 1 when none match'''
    rated_by_user = fashion_train.user_id == user_id
    matching_rows = fashion_train.loc[rated_by_user & style_cond(pin_id)]
    return 1 if matching_rows.empty else matching_rows.rating.mean()

print('RMSE for estimate4: %s' % evaluate(estimate4))

RMSE for estimate4: 0.840272347162

3.5. Mean items rating by blogger and user¶

In [23]:

def estimate5(user_id, pin_id):
    '''mean rating of same blogger by the user
    Input: user id, pin id
    Output: mean training rating the user gave to items of the pin's blogger, or 1 when there are none'''
    user_condition = fashion_train.user_id == user_id
    #label-based lookup of the pin's blogger (.ix was removed in pandas 1.0)
    pin_blogger = items_info.loc[pin_id, 'blog_name']
    pin_condition = fashion_train.blog_name == pin_blogger
    ratings_by_user_bloggers = fashion_train.loc[user_condition & pin_condition]

    if ratings_by_user_bloggers.empty:
        return 1 #the user has not rated this blogger in the training set
    return ratings_by_user_bloggers.rating.mean() #mean value for that blogger from the specific user

print('RMSE for estimate5: %s' % evaluate(estimate5))

RMSE for estimate5: 0.813904952702