Index
import json
import re
# pinterest_boards contained the gathered urls to the Pinterest boards from the search:
# http://www.pinterest.com/search/boards/?q=nyfw%20fall%202014
filename = "pinterest_boards.json"
with open(filename, "r") as f:
file_boards = json.loads(f.read())
# make sure all the boards belongs to the event. The url have to contain the words: nyfw, fall and 2014
# the boards that doesn't contain the words will be added to a separate dictionary for manually inspection
boards = {}
check_boards = {}
mandatory_words = ['nyfw','fall','2014']
for board in file_boards:
board = board['href'].encode("ascii")
valid_board = True
for word in mandatory_words:
if word not in board:
valid_board = False
break
if valid_board:
boards['_'.join(board.split('/')[3:5])] = board
else:
check_boards['_'.join(board.split('/')[3:5])] = board
print "Number of boards to manually inspect: " + str(len(check_boards))
Number of boards to manually inspect: 84
#After checking the names of the check_boards, I manually review some of them, and those 8 are the ones that belong to other events
del check_boards['fawkeshunter_fallwinter-2013']
del check_boards['csquared224_new-york-paris-fashion-week-spring-2014']
del check_boards['sadeesays_rock-it-%2B-with-2013-14-fallwinter-trend-looks']
del check_boards['sinnstyle_nyfw-fall14-at-sinn']
del check_boards['squarekey_nyfw']
del check_boards['themodeclectic_womens-fashion-vol-i-new-york-nyfw']
del check_boards['tide_tracy-reese-washable-fashion-designs-for-tide-pods']
del check_boards['tracyreeseny_tracy-tide-pods']
#Call Pinterest for get the pins from the selected boards
##You can find the gathered pins under the boards/ directory
'''Create a client object to a mongod localhost instance and create or retrieve a database
Input: Database name we want to connect to
Output: Database connection'''
def get_db(db_name):
from pymongo import MongoClient
client = MongoClient('localhost:27017')
db = client.db_name
return db
'''Insert a pin on the 'pins' collection from the db database
Input: Database name, Pin dictionary'''
def add_pin(db, pin):
if not db.pins.find_one(pin):
db.pins.insert(pin)
'''Integer fields clean up (pins, likes and comments).
Remove \n characters and whitespaces. Transform str to int
Input: Pin object, Name of the field to clean
Output: Integer value'''
def clean_int_field(pin, field):
try:
return int(re.sub('\n','', pin[field]).strip())
except:
return 0
'''Pin cleaning and create the desired structure to store on the collection
Input: Pin dictionary
Output: Pin cleaned dictionary'''
def clean_pin(pin):
pin['board'] = re.sub('\n','', pin['board']).strip()
pin['comments'] = clean_int_field(pin, 'comments')
pin['likes'] = clean_int_field(pin, 'likes')
pin['repins'] = clean_int_field(pin, 'repins')
return pin
not_valid = ['london', 'milan', 'paris', '2013', 'parisfashionweek', 'pfw', 'londonfashionweek', 'lfw', 'milanfashionweek', 'mfw']
'''Verify that the pin doesn't belongs to another runway event
Input: Pin object
Output: boolean'''
def valid_pin(pin):
valid = True
for word in not_valid:
if word in pin['description'].lower():
valid = False
break
return valid
'''Verify if is a valid pin, clean it and add it to the database
Input: Pin object'''
def clean_and_add(db, pins):
for pin in pins:
if valid_pin(pin):
pin = clean_pin(pin)
add_pin(db,pin)
file_boards = !ls "boards/"
print "Number of boards to analyze: " + str(len(file_boards))
Number of boards to analyze: 321
db = get_db('pinterest')
#db.pins.drop() #delete all pins from the collection and the metadata associated
#db.create_collection('pins')
#store in the database all the pins contained on each board json document
for filename in file_boards:
filename = 'boards/' + filename
with open(filename, "r") as f:
pins = json.loads(f.read())
clean_and_add(db, pins)
print "Total number of pins in the Pinterest database: " + str(db.pins.find().count())
Total number of pins in the Pinterest database: 33525
print "Total number of unique pins: " + str(len(db.pins.distinct("pin_page")))
Total number of unique pins: 24314
'''Boards whose pins appeared more times or originally have uploaded more images the same pin can appear in different users
boards but the "pinned from" board should be the same, in that case, the pin will be double count, but it's ok as
we are trying to see which boards are the most influence'''
pipeline = [
{ "$group": { "_id": "$board",
"count": { "$sum": 1 } } },
{ "$sort": { "count" : -1 } }
]
relevant_boards = db.pins.aggregate(pipeline)['result']
first10_boards = relevant_boards[:10]
sum_pins = 0
for board in first10_boards:
sum_pins += board['count']
print "Number of pins contained in the 10 boards with more pins: " + str(sum_pins)
Number of pins contained in the 10 boards with more pins: 26588
26588/33525.0 #The 10 first boards contain the 79% of the total gathered pins
0.7930797912005966
import numpy as np
from matplotlib import pyplot as plt
'''Add text to each graph bars
Input: bars, subplot object, boolean for correct displaying float values
Output: subplot object
'''
def autolabel(rects, ax, round_height):
# attach some text labels
for rect in rects:
height = rect.get_height()
if round_height:
height = round(height, 2)
ax.text(rect.get_x()+rect.get_width()/2., 1.05*height, height,
ha='center', va='bottom')
return ax
'''Plot a bar chart
Input: x data, y data, figure number, title, xlabel, ylabel, color, figure size, round value (true if data to display is float)
Output: bar chart plot
'''
def plot_bar_chart(x_data, y_data, num_fig, title='', xlabel='', ylabel='', color='blue', figsize=(8,5), round_values=False):
fig = plt.figure(num_fig, figsize=figsize)
ax = plt.subplot(111)
width = 0.8
barlist = ax.bar(range(len(y_data)), y_data, color=color)
ax = autolabel(barlist, ax, round_values)
ax.set_xticks(np.arange(len(x_data)) + width/2)
ax.set_xticklabels(x_data, rotation=90)
ax.set_title(" " + title)
plt.xlabel(xlabel)
plt.ylabel(ylabel)
return plt.show()
board_names = [ board['_id'].encode('ascii', 'ignore') for board in first10_boards]
pins_per_board = [ board['count'] for board in first10_boards]
plot_bar_chart(board_names, pins_per_board, 1, title="#Pins per each board", xlabel='', ylabel="#Pins", color='orange')
pipeline = [
{ "$group": { "_id": "$board",
"count": { "$sum": "$likes" } } },
{ "$sort": { "count" : -1 } }
]
liked_boards = db.pins.aggregate(pipeline)['result'][:10]
board_names = [ board['_id'].encode('ascii', 'ignore') for board in liked_boards]
likes_per_board = [ board['count'] for board in liked_boards]
plot_bar_chart(board_names, likes_per_board, 1, title="#Likes per each board", xlabel='', ylabel="#Likes", color='orange')
pipeline = [
{ "$group": { "_id": "$board",
"avg_likes": { "$avg": "$likes" } } },
{ "$sort": { "avg_likes" : -1 } }
]
liked_avg_boards = db.pins.aggregate(pipeline)['result'][:10]
board_names = [ board['_id'].encode('ascii', 'ignore') for board in liked_avg_boards]
avg_likes_per_board = [ board['avg_likes'] for board in liked_avg_boards]
plot_bar_chart(board_names, avg_likes_per_board, 1, title="Avarage of likes per each board", xlabel='', ylabel="Avarage of likes",
color='orange', round_values=True)
#1. Group by unique identifier -> $pin_page
#2. Sum integer fields
#3. Set of boards and description fields
pipeline = [
{ "$group": { "_id": "$pin_page",
"repins": { "$sum": "$repins"},
"description": { "$addToSet": "$description" },
"board": { "$addToSet": "$board" },
"href": { "$addToSet": "$href" },
"likes": { "$sum": "$likes" },
"comments": { "$sum": "$comments" } } },
]
unique_pins_result = db.pins.aggregate(pipeline)
#insert the result in the new collection unique_pins
db.unique_pins.insert(unique_pins_result['result']);
#Note: the boards where the pins appear at, can be different from the ones that I've gathered (I only looked for boards of NYFW 2014
#fall collection, but then people could add them to their own boards, and for instance the total number of repins, likes or comments
#may be different
more_pins = db.unique_pins.find().sort([("repins", -1)])[:10]
print 'board | pins | likes | pin_page'
for pin in more_pins:
print ('').join(pin['board']) + ' ' + str(pin['repins']) + ' ' + str(pin['likes']) + ' ' + pin['_id']
board | pins | likes | pin_page jcrew.com 342 47 http://media-cache-ec0.pinimg.com/236x/5c/93/29/5c932937622c02ee595f6c61835b4376.jpg jcrew.com 285 30 http://media-cache-ec0.pinimg.com/236x/f8/c7/80/f8c780815970c476f3217185f0415921.jpg jcrew.com 273 50 http://media-cache-ec0.pinimg.com/236x/07/8b/08/078b08f37232c4c82eeb12b42e66510d.jpg jcrew.com 262 42 http://media-cache-ak0.pinimg.com/236x/f5/7f/dc/f57fdc2f599d94adad3cf7d6d5dbac88.jpg jcrew.com 230 52 http://media-cache-ec0.pinimg.com/236x/22/b3/20/22b3207eb5c6d14c60455af121a05c9b.jpg jcrew.com 227 38 http://media-cache-ak0.pinimg.com/236x/15/61/25/1561252ace110826afec63b18ebafb54.jpg jcrew.com 213 17 http://media-cache-ec0.pinimg.com/236x/c9/a8/2e/c9a82e70682616bcd5af830fa65091bb.jpg jcrew.com 205 31 http://media-cache-ak0.pinimg.com/236x/ed/70/5a/ed705a7fb9d8a388b007ea571b516bfc.jpg jcrew.com 201 25 http://media-cache-ec0.pinimg.com/236x/73/43/45/734345b388d1222e4ab6366952adccee.jpg jcrew.com 194 31 http://media-cache-ak0.pinimg.com/236x/14/d2/c2/14d2c22449c7e3a56a98d1e5060b9111.jpg
** The 10 pins with more repins, all of them belongs to the JCrew board:
http://www.pinterest.com/jcrew/nyfw-fallwinter-2014/
more_likes = db.unique_pins.find().sort([("likes", -1)])[:10]
print 'board | likes | pins | href'
for pin in more_likes:
print ('').join(pin['board']) + ' ' + str(pin['likes']) + ' ' + str(pin['repins']) + ' ' + pin['href'][0]
board | likes | pins | href jcrew.com 52 230 http://www.pinterest.com/pin/224335625162998612/ jcrew.com 50 273 http://www.pinterest.com/pin/224335625162995128/ jcrew.com 47 342 http://www.pinterest.com/pin/224335625163000284/ jcrew.com 42 262 http://www.pinterest.com/pin/224335625162998702/ jcrew.com 40 158 http://www.pinterest.com/pin/224335625162995271/ jcrew.com 38 227 http://www.pinterest.com/pin/224335625162998334/ jcrew.com 32 93 http://www.pinterest.com/pin/224335625163000981/ jcrew.com 32 155 http://www.pinterest.com/pin/224335625162997500/ jcrew.com 31 194 http://www.pinterest.com/pin/224335625162999790/ stylecaster.com 31 39 http://www.pinterest.com/pin/163888873914182159/
# Using map reduce approach for counting the number of different hashtags
# map-reduce functions are written in JS
from bson.code import Code
map = Code("function(){"
"var description = this.description;"
"if (description) {"
"description = ( description.join(' ')).toLowerCase().split(' ');"
"for (var i = 0; i < description.length; i++) {"
"if (description[i]) {"
"if (/^#/.test(description[i])) {"
"emit(description[i], 1);"
"}"
"}"
"}"
"}"
"};")
reduce = Code("function( key, values ) {"
"var count = 0;"
"values.forEach(function(v) {"
"count += v;"
"});"
"return count;"
"}")
db.unique_pins.map_reduce(map, reduce, "word_count") # map_reduce(map, reduce, output collection)
Collection(Database(MongoClient('localhost', 27017), u'db_name'), u'word_count')
hashtags = db.word_count.find().sort([("value", -1)])[:10]
From this hashtag analysis is where I get the idea of looking for the Street style images
for hashtag in hashtags:
print hashtag
{u'_id': u'#nyfw', u'value': 2850.0} {u'_id': u'#fall', u'value': 222.0} {u'_id': u'#fashion', u'value': 217.0} {u'_id': u'#mbfw', u'value': 199.0} {u'_id': u'#winter', u'value': 198.0} {u'_id': u'#fall2014', u'value': 122.0} {u'_id': u'#streetstyle', u'value': 95.0} {u'_id': u'#fw14', u'value': 91.0} {u'_id': u'#roundup', u'value': 70.0} {u'_id': u'#style', u'value': 66.0}
# First we have to get a list of the designers on the NYFW
# I've used the BeautifulSoup library for looking into the html content
from bs4 import BeautifulSoup
import requests
url = 'http://www.style.com/fashionshows/F2014RTW/NYC' #url with a list of designers
'''Perform the get request
Input: URL string
Output: html gathered document'''
def get_html(url):
r = requests.get(url)
return r.text
'''Generate a list with the designer names from the html content
Input: html text
Output: list'''
def extract_designers(page):
designers = []
soup = BeautifulSoup(page)
designers_list = soup.find(id="alphabetical_list")
for designer in designers_list.select("span"):
designers.append(designer.get_text())
return designers
designers = extract_designers(get_html(url))
designers = [designer.encode('utf-8') for designer in designers]
#some clean up due to encoding problems
designers[7] = 'A Detacher'
designers[41] = 'Cut25 by Yigal Azrouel'
designers[61] = 'Herve Leger by Max Azria'
designers[154] = 'See by Chloe'
designers[193] = 'Ygal Azrouel'
'''Find the designer in the pin description. In order to find more matches, I've created 3 variants of the same word
example: combined = '(Carolina Herrera)|(CarolinaHerrera)|(Carolina-Herrera)'
Create a dictionary with key=designer, value=number of appearances
'''
designers_count = {}
for designer in designers:
aux = [designer, designer.replace(' ',''), designer.replace(' ','-')] #try to find different possible names for the same designer
combined = "(" + ")|(".join(aux) + ")"
designers_count[designer] = db.unique_pins.find({'description' : {"$regex": combined, "$options": 'i' } }).count() #ignorecase
sorted(designers_count.items(), key=lambda counts: counts[1], reverse=True)[:10]
[(u'Co', 10626), (u'Marc Jacobs', 496), (u'Ralph Lauren', 481), (u'Alexander Wang', 397), (u'Oscar de la Renta', 362), (u'Prabal Gurung', 332), (u'Michael Kors', 313), (u'Jason Wu', 307), (u'Carolina Herrera', 259), (u'Vera Wang', 255)]
street_style = '(streetstyle) | (street-style) | (street style)'
street_count = db.unique_pins.find({'description' : {"$regex": street_style, "$options": 'i' } }).count()
print 'Number of unique pins about Street Style: ' + str(street_count)
Number of unique pins about Street Style: 631
street = db.unique_pins.find({'description' : {"$regex": street_style, "$options": 'i' } }).sort([("repins", -1)])[:10]
print 'repins | board | pin_page'
for pin in street:
print str(pin['repins']) + ' ' + ('').join(pin['board']) + ' ' + pin['href'][0]
repins | board | pin_page 42 racked.com http://www.pinterest.com/pin/163888873914141623/ 38 stylecaster.com http://www.pinterest.com/pin/163888873914157749/ 26 racked.com http://www.pinterest.com/pin/163888873914141607/ 26 ellecanada.com http://www.pinterest.com/pin/135178426289605486/ 22 stylecaster.com http://www.pinterest.com/pin/163888873914141508/ 21 stylecaster.com http://www.pinterest.com/pin/163888873914151877/ 18 stylecaster.com http://www.pinterest.com/pin/163888873914151871/ 15 stylecaster.com http://www.pinterest.com/pin/163888873914169406/ 15 ellecanada.com http://www.pinterest.com/pin/135178426289605476/ 14 ellecanada.com http://www.pinterest.com/pin/135178426289581663/