The explanation of this implementation can be found at: http://www.rosariomgomez.me/
Index
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from create_features import create_user_features
def get_db():
    """Connect to MongoDB and return the authenticated database handle.

    The host ('server'), the port, the database name and the credentials
    used below are placeholders and must be replaced with real values.
    """
    from pymongo import MongoClient

    # server, port
    mongo_client = MongoClient('server', port)
    # database name
    database = mongo_client.database_name
    # NOTE(review): Database.authenticate is not available in recent PyMongo
    # releases -- confirm against the installed version.
    database.authenticate("user", "pwd")
    return database
# Open the database handle used by the rest of the script.
db = get_db()

# Fetch every user document and turn each one into a feature vector.
all_users = db.user.find()
list_users = list(map(create_user_features, all_users))

# Build the users dataframe; '_id' is renamed to 'user_id' so that it
# matches the column name used by the ratings.
users = pd.DataFrame(list_users).rename(columns={'_id': 'user_id'})
users.head()
user_id | age | country | day_off | dress_size | fashionista | like_styles_pref | like_styles_pref_bohemian chic | like_styles_pref_casual chic | like_styles_pref_classic | like_styles_pref_edgy | like_styles_pref_preppy | like_styles_pref_romantic | nolike_styles_pref | nolike_styles_pref_bohemian chic | nolike_styles_pref_casual chic | nolike_styles_pref_classic | nolike_styles_pref_edgy | nolike_styles_pref_preppy | nolike_styles_pref_romantic | ||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 53962dfa3191490008a690df | 55 | US | sport | 10 | nolike | [classic, casual chic, preppy] | 0 | 1 | 1 | 0 | 1 | 0 | [] | 0 | 0 | 0 | 0 | 0 | 0 | ... |
1 | 53968e993d4e0c0007a2546f | 50 | US | family | 6 | nolike | [] | 0 | 0 | 0 | 0 | 0 | 0 | [edgy] | 0 | 0 | 0 | 1 | 0 | 0 | ... |
2 | 53971683b7d85a0008b1bbe2 | 30 | ES | family | 8 | nolike | [romantic, casual chic] | 0 | 1 | 0 | 0 | 0 | 1 | [edgy] | 0 | 0 | 0 | 1 | 0 | 0 | ... |
3 | 539851a17f6ba70007ba8bdb | 30 | ES | party | 10 | nolike | [romantic, casual chic, preppy] | 0 | 1 | 0 | 0 | 1 | 1 | [edgy] | 0 | 0 | 0 | 1 | 0 | 0 | ... |
4 | 539770c4a9a4570008c28a9b | 45 | ES | family | 10 | ok | [classic, casual chic, preppy] | 0 | 1 | 1 | 0 | 1 | 0 | [bohemian chic, edgy] | 1 | 0 | 0 | 1 | 0 | 0 | ... |
5 rows × 27 columns
# Pull every rating document out of the DB and materialise the cursor.
rated_outfits = db.ratings.find()
list_ratings = list(rated_outfits)

# Ratings dataframe, restricted to the three columns of interest.
cols = ['user_id', 'pin_id', 'rating']
ratings = pd.DataFrame(data=list_ratings, columns=cols)
ratings.head()
user_id | pin_id | rating | |
---|---|---|---|
0 | 538677f561e01f0be9e838f7 | 537933bb61e01f10f111886f | 0 |
1 | 538677f561e01f0be9e838f7 | 537933a161e01f10f11187e6 | 0 |
2 | 538677f561e01f0be9e838f7 | 53793d6d61e01f10f111a725 | 2 |
3 | 538677f561e01f0be9e838f7 | 537933be61e01f10f111887f | 2 |
4 | 538677f561e01f0be9e838f7 | 53793d8861e01f10f111a741 | 0 |
5 rows × 3 columns
# Join each rating with the features of the user who gave it. No join key is
# passed, so pandas joins on the columns the two frames have in common.
fashion_users = ratings.merge(users)
fashion_users.head()
user_id | pin_id | rating | age | country | day_off | dress_size | fashionista | like_styles_pref | like_styles_pref_bohemian chic | like_styles_pref_casual chic | like_styles_pref_classic | like_styles_pref_edgy | like_styles_pref_preppy | like_styles_pref_romantic | nolike_styles_pref | nolike_styles_pref_bohemian chic | nolike_styles_pref_casual chic | nolike_styles_pref_classic | nolike_styles_pref_edgy | ||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 538677f561e01f0be9e838f7 | 537933bb61e01f10f111886f | 0 | 30 | US | sport | 8 | love | [classic, casual chic, preppy] | 0 | 1 | 1 | 0 | 1 | 0 | [edgy] | 0 | 0 | 0 | 1 | ... |
1 | 538677f561e01f0be9e838f7 | 537933a161e01f10f11187e6 | 0 | 30 | US | sport | 8 | love | [classic, casual chic, preppy] | 0 | 1 | 1 | 0 | 1 | 0 | [edgy] | 0 | 0 | 0 | 1 | ... |
2 | 538677f561e01f0be9e838f7 | 53793d6d61e01f10f111a725 | 2 | 30 | US | sport | 8 | love | [classic, casual chic, preppy] | 0 | 1 | 1 | 0 | 1 | 0 | [edgy] | 0 | 0 | 0 | 1 | ... |
3 | 538677f561e01f0be9e838f7 | 537933be61e01f10f111887f | 2 | 30 | US | sport | 8 | love | [classic, casual chic, preppy] | 0 | 1 | 1 | 0 | 1 | 0 | [edgy] | 0 | 0 | 0 | 1 | ... |
4 | 538677f561e01f0be9e838f7 | 53793d8861e01f10f111a741 | 0 | 30 | US | sport | 8 | love | [classic, casual chic, preppy] | 0 | 1 | 1 | 0 | 1 | 0 | [edgy] | 0 | 0 | 0 | 1 | ... |
5 rows × 29 columns
** Rating values can be: 0 (don't like), 1 (ok) or 2 (like)
# Pin x user rating matrix: one row per pin, one column per user, each cell
# holding the (mean) rating that user gave to that pin.
# `index`/`columns` replace the `rows`/`cols` keywords removed from pandas.
aux_pt = fashion_users.pivot_table(values='rating', index='pin_id', columns='user_id')

# Show only a window of 35 pins. The pin_id index is not integer-valued, so
# the former `.ix[410:445, :]` slice was positional; `.iloc` states that
# explicitly (`.ix` no longer exists in pandas).
matrix = aux_pt.iloc[410:445, :]

fig = plt.figure(1, figsize=(8, 8))
plt.imshow(matrix, interpolation='nearest')
plt.ylabel("Pin id")
plt.xlabel("User id")
plt.colorbar()
plt.show()
#fig.savefig('matrix.png', dpi = (200))
# Mean rating per age group (ratings seem to be lower among the middle age
# groups). `index` replaces the `rows` keyword removed from pandas.
pt_age = fashion_users.pivot_table(values='rating', index=['age'], aggfunc=[np.mean])
pt_age.plot(kind='bar', color='orange')
plt.title('Mean rating by age group')
plt.show()
# Number of ratings, mean rating and standard deviation per user profession.
# `index` replaces the `rows` keyword removed from pandas.
fashion_users.pivot_table(values='rating', index=['profession'], aggfunc=[np.size, np.mean, np.std])
size | mean | std | |
---|---|---|---|
profession | |||
art | 104 | 0.884615 | 0.832050 |
business | 548 | 1.003650 | 0.836433 |
other | 802 | 1.019950 | 0.862903 |
science | 3006 | 1.021956 | 0.865170 |
social | 112 | 0.482143 | 0.571794 |
5 rows × 3 columns
# The group of users who like shopping is the one that rated the largest
# number of outfits. `index` replaces the `rows` keyword removed from pandas.
fashion_users.pivot_table(values='rating', index=['day_off'], aggfunc=[np.size, np.mean, np.std])
size | mean | std | |
---|---|---|---|
day_off | |||
family | 604 | 0.890728 | 0.826058 |
party | 1234 | 1.009724 | 0.850841 |
read | 132 | 1.106061 | 0.896640 |
shop | 1666 | 1.094838 | 0.862903 |
sport | 936 | 0.888889 | 0.856043 |
5 rows × 3 columns
# Users whose "fashionista" value is "love" or "ok" gave higher ratings than
# the ones who do not like fashion. `index` replaces the `rows` keyword
# removed from pandas.
fashion_users.pivot_table(values='rating', index=['fashionista'], aggfunc=[np.size, np.mean, np.std])
size | mean | std | |
---|---|---|---|
fashionista | |||
love | 1646 | 1.002430 | 0.865671 |
nolike | 414 | 0.850242 | 0.819535 |
ok | 2512 | 1.028662 | 0.857574 |
3 rows × 3 columns
# Bar chart of the mean rating for each "fashionista" category.
# `index` replaces the `rows` keyword removed from pandas.
pt_fashionistas = fashion_users.pivot_table(values='rating', index=['fashionista'], aggfunc=[np.mean])
pt_fashionistas.plot(kind='bar', color='orange')
plt.title('Mean ratings by fashionistas classification')
plt.show()
# Ratings broken down by age group and country: higher ratings from Spanish
# friends than from US ones. `index` replaces the `rows` keyword removed from
# pandas.
fashion_users.pivot_table(values='rating', index=['age', 'country'], aggfunc=[np.size, np.mean, np.std])
size | mean | std | ||
---|---|---|---|---|
age | country | |||
20 | ES | 24 | 1.166667 | 0.816497 |
US | 176 | 0.994318 | 0.851870 | |
25 | ES | 506 | 1.229249 | 0.880377 |
US | 27 | 0.740741 | 0.813000 | |
30 | ES | 286 | 1.101399 | 0.842010 |
SE | 22 | 1.363636 | 0.847711 | |
US | 476 | 0.848739 | 0.839936 | |
35 | ES | 193 | 0.772021 | 0.809993 |
US | 151 | 0.854305 | 0.778009 | |
40 | IE | 288 | 0.913194 | 0.828141 |
US | 21 | 0.952381 | 0.920662 | |
45 | ES | 25 | 0.920000 | 0.812404 |
50 | US | 4 | 1.250000 | 0.957427 |
55 | ES | 83 | 1.301205 | 0.851612 |
US | 4 | 0.500000 | 0.577350 |
15 rows × 3 columns
# Bar chart of the mean rating for every (age, country) pair.
# `index` replaces the `rows` keyword removed from pandas.
pt_country_age = fashion_users.pivot_table(values='rating', index=['age', 'country'], aggfunc=[np.mean])
pt_country_age.plot(kind='bar', color='orange')
plt.title('Mean ratings by age and country')
plt.show()
** Styles go from 0 (hate the style) to 1 (love the style)
# Columns holding each user's score for the six styles.
style_columns = ['user_style_casual chic', 'user_style_preppy', 'user_style_classic',
                 'user_style_romantic', 'user_style_bohemian chic', 'user_style_edgy']

# Per-style mean over all users. `.loc` with a list replaces the removed
# `.ix` indexer, and the explicit axis keeps the result one value per column
# (np.mean on a DataFrame reduces over both axes in recent pandas).
styles = users.loc[:, style_columns].mean(axis=0)

fig = plt.figure(1, figsize=(8, 5))
ax = plt.subplot(111)
styles.plot(kind='bar', color='orange', ax=ax)
# Tick labels are the column names without the 'user_style_' prefix.
ax.set_xticklabels([name[len('user_style_'):] for name in style_columns], rotation=90)
plt.title('Mean of liked styles by users')
#plt.tight_layout()
plt.show()
#fig.savefig('styles_mean.png', dpi = (300))