# Analysis of the Pinterest boards and pins about the New York Fashion Week
# (NYFW) fall 2014 collection.
#
# NOTE: this script was exported from an IPython notebook. It is written for
# Python 2 and appears to target the pymongo 2.x API (``insert``,
# ``aggregate(...)['result']``, ``cursor.count()``). It needs a mongod
# instance listening on localhost:27017.

import json
import os
import re

import numpy as np
import requests
from bs4 import BeautifulSoup
from bson.code import Code
from matplotlib import pyplot as plt
from pymongo import MongoClient


# ---------------------------------------------------------------------------
# 1. Select the boards that belong to the event
# ---------------------------------------------------------------------------

# pinterest_boards.json contains the gathered URLs of the Pinterest boards
# returned by the search:
# http://www.pinterest.com/search/boards/?q=nyfw%20fall%202014
filename = "pinterest_boards.json"
with open(filename, "r") as f:
    file_boards = json.load(f)

# Make sure all the boards belong to the event: the URL has to contain the
# words "nyfw", "fall" and "2014". Boards whose URL lacks any of those words
# are added to a separate dictionary for manual inspection.
boards = {}
check_boards = {}
mandatory_words = ['nyfw', 'fall', '2014']

for board in file_boards:
    board = board['href'].encode("ascii")
    # Dictionary key: 4th and 5th "/"-separated pieces of the URL joined by
    # "_" (e.g. 'squarekey_nyfw').
    board_key = '_'.join(board.split('/')[3:5])
    if all(word in board for word in mandatory_words):
        boards[board_key] = board
    else:
        check_boards[board_key] = board

print("Number of boards to manually inspect: " + str(len(check_boards)))

# After checking the names in check_boards, I manually reviewed some of them;
# these 8 are the ones that belong to other events.
other_event_boards = [
    'fawkeshunter_fallwinter-2013',
    'csquared224_new-york-paris-fashion-week-spring-2014',
    'sadeesays_rock-it-%2B-with-2013-14-fallwinter-trend-looks',
    'sinnstyle_nyfw-fall14-at-sinn',
    'squarekey_nyfw',
    'themodeclectic_womens-fashion-vol-i-new-york-nyfw',
    'tide_tracy-reese-washable-fashion-designs-for-tide-pods',
    'tracyreeseny_tracy-tide-pods',
]
for board_key in other_event_boards:
    # ``del`` (not ``pop`` with a default) on purpose: a missing key means
    # the input data changed and the manual review has to be redone.
    del check_boards[board_key]

# Call Pinterest to get the pins from the selected boards.
# You can find the gathered pins under the boards/ directory.


# ---------------------------------------------------------------------------
# 2. Clean the pins and store them in MongoDB
# ---------------------------------------------------------------------------

def get_db(db_name):
    """Create a client for the mongod localhost instance and create or
    retrieve a database.

    Input: name of the database we want to connect to
    Output: database connection
    """
    client = MongoClient('localhost:27017')
    # Look the database up by name. ``client.db_name`` would always select a
    # database literally called "db_name" and ignore the argument.
    return client[db_name]


def add_pin(db, pin):
    """Insert a pin in the 'pins' collection of the db database, unless an
    identical document is already stored.

    Input: database connection, pin dictionary
    """
    if not db.pins.find_one(pin):
        db.pins.insert(pin)


def clean_int_field(pin, field):
    """Integer fields clean up (repins, likes and comments).

    Removes newline characters and surrounding whitespace and converts the
    text to int.

    Input: pin dictionary, name of the field to clean
    Output: integer value; 0 when the field is missing, is not text or does
            not hold a number
    """
    try:
        return int(re.sub('\n', '', pin[field]).strip())
    except (KeyError, TypeError, ValueError):
        # KeyError: field absent; TypeError: value is not a string;
        # ValueError: text is empty or not numeric.
        return 0


def clean_pin(pin):
    """Clean the pin and give it the structure stored in the collection.

    The dictionary is modified in place and also returned.

    Input: pin dictionary
    Output: cleaned pin dictionary
    """
    pin['board'] = re.sub('\n', '', pin['board']).strip()
    for field in ('comments', 'likes', 'repins'):
        pin[field] = clean_int_field(pin, field)
    return pin


# Words that reveal that a pin is about another runway event.
not_valid = ['london', 'milan', 'paris', '2013', 'parisfashionweek', 'pfw',
             'londonfashionweek', 'lfw', 'milanfashionweek', 'mfw']


def valid_pin(pin):
    """Verify that the pin doesn't belong to another runway event.

    Input: pin dictionary (its 'description' is searched, ignoring case)
    Output: boolean, False when the description contains any word of
            ``not_valid``
    """
    description = pin['description'].lower()  # lowercase once, not per word
    return not any(word in description for word in not_valid)


def clean_and_add(db, pins):
    """Verify each pin and, when it is valid, clean it and add it to the
    database.

    Input: database connection, list of pin dictionaries
    """
    for pin in pins:
        if valid_pin(pin):
            add_pin(db, clean_pin(pin))


# One JSON document per board under boards/. The notebook listed them with
# the IPython shell escape ``!ls "boards/"``, which is not Python syntax;
# like ``ls``, hidden entries are skipped and names are sorted.
file_boards = sorted(
    name for name in os.listdir('boards') if not name.startswith('.'))
print("Number of boards to analyze: " + str(len(file_boards)))

db = get_db('pinterest')
# To start from scratch, delete all pins from the collection (and the
# associated metadata) and create it again:
#db.pins.drop()
#db.create_collection('pins')

# Store in the database all the pins contained in each board JSON document.
for filename in file_boards:
    with open(os.path.join('boards', filename), "r") as f:
        pins = json.load(f)
    clean_and_add(db, pins)

total_pins = db.pins.find().count()
print("Total number of pins in the Pinterest database: " + str(total_pins))
print("Total number of unique pins: "
      + str(len(db.pins.distinct("pin_page"))))


# ---------------------------------------------------------------------------
# 3. Most relevant boards
# ---------------------------------------------------------------------------

# Boards whose pins appear more times, or that originally uploaded more
# images. The same pin can appear in different users' boards, but its
# "pinned from" board should be the same. In that case the pin is counted
# twice, which is fine: we are trying to see which boards are the most
# influential.
pipeline = [
    {"$group": {"_id": "$board", "count": {"$sum": 1}}},
    {"$sort": {"count": -1}},
]
relevant_boards = db.pins.aggregate(pipeline)['result']
first10_boards = relevant_boards[:10]

sum_pins = sum(board['count'] for board in first10_boards)
print("Number of pins contained in the 10 boards with more pins: "
      + str(sum_pins))

# In the original run this was 26588 / 33525.0: the first 10 boards contained
# 79% of the total gathered pins. Computed from the data here so that the
# figure is actually shown when running as a script.
if total_pins:
    print("Share of all pins held by those 10 boards: "
          + str(round(sum_pins / float(total_pins), 2)))


def autolabel(rects, ax, round_height):
    """Add a text label with its height on top of each bar.

    Input: bars, subplot object, boolean (True rounds the value to 2
           decimals, for displaying float values correctly)
    Output: subplot object
    """
    for rect in rects:
        height = rect.get_height()
        if round_height:
            height = round(height, 2)
        # Centered horizontally on the bar, slightly above its top.
        ax.text(rect.get_x() + rect.get_width() / 2., 1.05 * height, height,
                ha='center', va='bottom')
    return ax


def plot_bar_chart(x_data, y_data, num_fig, title='', xlabel='', ylabel='',
                   color='blue', figsize=(8, 5), round_values=False):
    """Plot a bar chart.

    Input: x data (tick labels), y data (bar heights), figure number, title,
           xlabel, ylabel, color, figure size, round values (True if the data
           to display is float)
    Output: result of ``plt.show()``
    """
    plt.figure(num_fig, figsize=figsize)
    ax = plt.subplot(111)
    width = 0.8
    # Explicit width and left-edge alignment, so that the tick offset of
    # width/2 below lands on the middle of each bar.
    barlist = ax.bar(range(len(y_data)), y_data, width=width, align='edge',
                     color=color)
    ax = autolabel(barlist, ax, round_values)
    ax.set_xticks(np.arange(len(x_data)) + width / 2)
    ax.set_xticklabels(x_data, rotation=90)
    ax.set_title(" " + title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    return plt.show()


# Number of pins per board (10 boards with more pins).
board_names = [board['_id'].encode('ascii', 'ignore')
               for board in first10_boards]
pins_per_board = [board['count'] for board in first10_boards]
plot_bar_chart(board_names, pins_per_board, 1, title="#Pins per each board",
               xlabel='', ylabel="#Pins", color='orange')

# Total number of likes per board (10 most liked boards).
pipeline = [
    {"$group": {"_id": "$board", "count": {"$sum": "$likes"}}},
    {"$sort": {"count": -1}},
]
liked_boards = db.pins.aggregate(pipeline)['result'][:10]
board_names = [board['_id'].encode('ascii', 'ignore')
               for board in liked_boards]
likes_per_board = [board['count'] for board in liked_boards]
plot_bar_chart(board_names, likes_per_board, 1, title="#Likes per each board",
               xlabel='', ylabel="#Likes", color='orange')

# Average number of likes per pin for each board (10 highest averages).
pipeline = [
    {"$group": {"_id": "$board", "avg_likes": {"$avg": "$likes"}}},
    {"$sort": {"avg_likes": -1}},
]
liked_avg_boards = db.pins.aggregate(pipeline)['result'][:10]
board_names = [board['_id'].encode('ascii', 'ignore')
               for board in liked_avg_boards]
avg_likes_per_board = [board['avg_likes'] for board in liked_avg_boards]
plot_bar_chart(board_names, avg_likes_per_board, 1,
               title="Avarage of likes per each board", xlabel='',
               ylabel="Avarage of likes", color='orange', round_values=True)


# ---------------------------------------------------------------------------
# 4. Unique pins
# ---------------------------------------------------------------------------

# 1. Group by unique identifier -> $pin_page
# 2. Sum integer fields
# 3. Set of boards and description fields
pipeline = [
    {"$group": {
        "_id": "$pin_page",
        "repins": {"$sum": "$repins"},
        "description": {"$addToSet": "$description"},
        "board": {"$addToSet": "$board"},
        "href": {"$addToSet": "$href"},
        "likes": {"$sum": "$likes"},
        "comments": {"$sum": "$comments"},
    }},
]
unique_pins_result = db.pins.aggregate(pipeline)

# Insert the result in the new collection unique_pins.
# NOTE(review): "_id" is the pin_page, so running this twice against an
# already populated unique_pins collection presumably fails with duplicate
# keys -- drop the collection first when re-running.
db.unique_pins.insert(unique_pins_result['result'])

# Note: the boards where the pins appear can be different from the ones that
# I've gathered (I only looked for boards of the NYFW 2014 fall collection,
# but then people could add the pins to their own boards), so for instance
# the total number of repins, likes or comments may be different.
more_pins = db.unique_pins.find().sort([("repins", -1)])[:10]
print('board | pins | likes | pin_page')
for pin in more_pins:
    print(('').join(pin['board']) + ' ' + str(pin['repins']) + ' '
          + str(pin['likes']) + ' ' + pin['_id'])

more_likes = db.unique_pins.find().sort([("likes", -1)])[:10]
print('board | likes | pins | href')
for pin in more_likes:
    print(('').join(pin['board']) + ' ' + str(pin['likes']) + ' '
          + str(pin['repins']) + ' ' + pin['href'][0])


# ---------------------------------------------------------------------------
# 5. Most used hashtags
# ---------------------------------------------------------------------------

# Using a map-reduce approach for counting the number of different hashtags.
# The map and reduce functions are written in JS. (Named hashtag_map /
# hashtag_reduce so that the ``map`` and ``reduce`` builtins stay usable.)
hashtag_map = Code(
    "function(){"
    "var description = this.description;"
    "if (description) {"
    "description = ( description.join(' ')).toLowerCase().split(' ');"
    "for (var i = 0; i < description.length; i++) {"
    "if (description[i]) {"
    "if (/^#/.test(description[i])) {"
    "emit(description[i], 1);"
    "}"
    "}"
    "}"
    "}"
    "};")

hashtag_reduce = Code(
    "function( key, values ) {"
    "var count = 0;"
    "values.forEach(function(v) {"
    "count += v;"
    "});"
    "return count;"
    "}")

# map_reduce(map, reduce, output collection)
db.unique_pins.map_reduce(hashtag_map, hashtag_reduce, "word_count")

hashtags = db.word_count.find().sort([("value", -1)])[:10]
for hashtag in hashtags:
    print(hashtag)


# ---------------------------------------------------------------------------
# 6. Most mentioned designers
# ---------------------------------------------------------------------------

# First we have to get a list of the designers on the NYFW.
# I've used the BeautifulSoup library for looking into the html content.
url = 'http://www.style.com/fashionshows/F2014RTW/NYC'  # url with a list of designers


def get_html(url):
    """Perform the GET request.

    Input: URL string
    Output: gathered html document (text)
    """
    return requests.get(url).text


def extract_designers(page):
    """Generate a list with the designer names from the html content.

    The names are the texts of the <span> elements found inside the element
    whose id is "alphabetical_list".

    Input: html text
    Output: list of designer names
    """
    soup = BeautifulSoup(page)
    designers_list = soup.find(id="alphabetical_list")
    return [span.get_text() for span in designers_list.select("span")]


designers = extract_designers(get_html(url))
designers = [designer.encode('utf-8') for designer in designers]

# Some clean up due to encoding problems.
# NOTE(review): the positions are hard-coded for the page as it was when it
# was gathered; verify them if the designers list changes.
designers[7] = 'A Detacher'
designers[41] = 'Cut25 by Yigal Azrouel'
designers[61] = 'Herve Leger by Max Azria'
designers[154] = 'See by Chloe'
designers[193] = 'Ygal Azrouel'

# Find the designer in the pin description. In order to find more matches,
# I've created 3 variants of the same name, for example:
#   combined = '(Carolina Herrera)|(CarolinaHerrera)|(Carolina-Herrera)'
# Create a dictionary with key=designer, value=number of appearances.
designers_count = {}
for designer in designers:
    # Different possible spellings of the same designer.
    variants = [designer, designer.replace(' ', ''),
                designer.replace(' ', '-')]
    combined = "(" + ")|(".join(variants) + ")"
    designers_count[designer] = db.unique_pins.find(
        {'description': {"$regex": combined, "$options": 'i'}}  # ignore case
    ).count()

# The 10 designers with more appearances (the notebook displayed this value
# as the cell output; a script has to print it).
top_designers = sorted(designers_count.items(),
                       key=lambda counts: counts[1], reverse=True)[:10]
for designer, appearances in top_designers:
    print(designer + ' ' + str(appearances))


# ---------------------------------------------------------------------------
# 7. Street style
# ---------------------------------------------------------------------------

# No spaces around "|": they are literal characters in the pattern, so with
# them the alternatives only matched when written next to a space.
street_style = '(streetstyle)|(street-style)|(street style)'
street_query = {'description': {"$regex": street_style, "$options": 'i'}}

street_count = db.unique_pins.find(street_query).count()
print('Number of unique pins about Street Style: ' + str(street_count))

street = db.unique_pins.find(street_query).sort([("repins", -1)])[:10]
print('repins | board | pin_page')
for pin in street:
    print(str(pin['repins']) + ' ' + ('').join(pin['board']) + ' '
          + pin['href'][0])