#!/usr/bin/env python
# coding: utf-8

# In[1]:

from bs4 import BeautifulSoup
import requests
import re
import pandas as pd
import csv
import matplotlib.pyplot as plt
import seaborn as sns
from geopy.geocoders import Nominatim
import string
import numpy as np
import scipy as sp
from scipy import stats
get_ipython().run_line_magic('pylab', 'inline')


# # Pre-Steps:
# 
# ### Step 1:
# Create a profile with only basic, generic information. The identity is deliberately different from the commonly seen 20-something US profiles, so that later we can tell whether the system returns results purely based on my preferences and search criteria, or whether it still customizes the results around my own profile.
# 
# * 46-year-old man in China (age, gender and location are required for registration)
# 
# I'm looking for:
# * Everyone (not gender specific)
# * Ages 18-99
# * Located anywhere
# * For new friends, short-term dating, long-term dating or casual sex
# 
# ### Step 2:
# Open the general browse page and set the search criteria.
# 
# **Straight Male:**
# * Man
# * Interested in women
# * Ages 18 to 100 (*OkCupid users must be 18 or older*)
# * Located anywhere
# * Online in the last month (*active users*)
# * Order by special blend
# 
# **Straight Female:**
# * Woman
# * Interested in men
# * Ages 18 to 100 (*OkCupid users must be 18 or older*)
# * Located anywhere
# * Online in the last month (*active users*)
# * Order by special blend
# 
# **Bisexual All Gender:**
# * All genders
# * Interested in everyone
# * Ages 18 to 100 (*OkCupid users must be 18 or older*)
# * Located anywhere
# * Online in the last month (*active users*)
# * Order by special blend
# 
# ### Step 3
# Copy the search page URL and the login response cookies into the code.

# In[ ]:

# This function did not work for some reason, so I had to paste in my browser cookies manually.
def getCookies():
    req = requests.post('https://www.okcupid.com/login',
                        data={'login_username': 'uhohcantletyouknow',
                              'login_password': 'uhohcantletyouknow'})
    cookies = req.cookies
    return cookies

cookies = cookies  # placeholder: set this to the cookies copied from a logged-in browser session


# In[ ]:

# get a maximum of 30 usernames from one search page
def getUsernames():
    url = ('http://www.okcupid.com/match?filter1=0,48&filter2=2,100,18&filter3=5,2678400'
           '&filter4=1,1&locid=0&timekey=1&matchOrderBy=SPECIAL_BLEND&custom_search=0'
           '&fromWhoOnline=0&mygender=m&update_prefs=1&sort_type=0&sa=1&using_saved_search=&count=30')
    page = requests.get(url, cookies=cookies).text
    soup = BeautifulSoup(page, 'html5lib')
    result = soup.find_all('div', {'class': 'match_card_wrapper user-not-hidden '})
    roughnames = [i.get('id') for i in result]
    usernames = [re.findall('usr-(.*)-wrapper', i)[0] for i in roughnames]
    return usernames

# repeat the above search multiple times
def getLotsUsernames():
    usernames = []
    for i in range(1000):   # 1000 pages * 30 usernames per page = about 30,000 usernames
        usernames += getUsernames()
        print 'Scraped', i, 'of 1000 targeted pages.'
    unique = set(usernames)
    print 'Downloaded %d usernames, of which %d are unique.' % (len(usernames), len(unique))
    return unique
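# The match cards carry the username in their `id` attribute, which is what the
# `'usr-(.*)-wrapper'` regex above pulls out; the profile URL is then just
# `http://www.okcupid.com/profile/<username>`, as used in Part II. A quick sanity
# check of that extraction on a made-up id value:

# In[ ]:

sample_id = 'usr-example_user123-wrapper'   # hypothetical id attribute, in the shape the scraper expects
sample_username = re.findall('usr-(.*)-wrapper', sample_id)[0]
sample_profile_url = 'http://www.okcupid.com/profile/' + sample_username
print sample_username, '->', sample_profile_url   # example_user123 -> http://www.okcupid.com/profile/example_user123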
# # Part I. Scrape usernames and save them to files.
# 
# ### Why?
# 
# The usernames are used to generate each user's unique profile URL. Scraping 300 pages (9,000 usernames, not counting duplicates) for each gender takes over an hour, so just to be safe I save the scraped usernames to a file.
# 
# *The scraping was done on my local server for speed.*
# 
# * Straight Male Users:
# Downloaded **30,000** usernames, of which **20,565** are unique.
# Unique percentage: **68.55%**
# 
# * Straight Female Users:
# Downloaded **30,000** usernames, of which **24,195** are unique.
# Unique percentage: **80.65%**
# 
# * Bisexual All Gender Users:
# Downloaded **300,000** usernames, of which **23,565** are unique.
# Unique percentage: **7.855%**

# In[ ]:

get_ipython().run_cell_magic('time', '', 'usernames = getLotsUsernames()\n')


# In[ ]:

usernames = list(usernames)
for i in range(len(usernames)):
    usernames[i] = usernames[i].encode('utf-8')


# In[ ]:

# write usernames into a new file
def writeUsernames(usernames):
    string = ''
    for i in usernames:
        string += i + '\n'
    with open('usernames.txt', 'w') as f:
        f.write(string)
    print len(usernames), 'usernames have been written to usernames.txt.'

# append newly scraped usernames; the file is later rewritten with only unique names
# if duplicates show up across scraping runs
def appendUsernames(usernames):
    string = ''
    for i in usernames:
        string += i + '\n'
    with open('usernames.txt', 'a') as f:
        f.write(string)
    print len(usernames), 'usernames have been appended to usernames.txt.'


# In[ ]:

appendUsernames(usernames)


# In[2]:

plt.rcParams['figure.figsize'] = 6, 6
a = {'Straight Male': 2054, 'Straight Female': 2412, 'Bisexual': 235}
a = pd.Series(a.values(), index = a.keys())
a.plot(kind = 'pie', colors=('dodgerblue', '#F08080', 'mediumpurple'))
plt.title('Gender Distribution of My Dataset', fontsize=25)
figure(figsize=(8,8))


# # Part II. Scrape user information from each unique profile page
# 
# ### 1. Iterate through the usernames list and generate each unique user profile link.
# Number of total profiles scraped:
# 
# * Straight Male: 2,054
# * Straight Female: 2,412
# * Bisexual All Gender: 782
# 
# ### 2. Three groups of user information:
# 
# * User basic information: gender, age, location, orientation, ethnicities, height, bodytype, diet, smoking, drinking, drugs, religion, sign, education, job, income, status, monogamous, children, pets, languages
# 
# * User matching information: gender orientation, age range, location, single, purpose
# 
# * User self-description: summary, what they are currently doing, what they are good at, noticeable facts, favourite books/movies, things they can't live without, how they spend their time, Friday activities, the most private thing, message preference
# 
# ### 3. Contents are stored in a dictionary, which is converted to a pandas DataFrame and then exported to a csv file.
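# Before the full scraper, here is a minimal sketch (with made-up values and a
# hypothetical output filename, 'toy_profile.csv') of the dictionary-of-lists ->
# DataFrame -> csv pattern described above: each profile field keeps a list, with
# one entry per user.

# In[ ]:

# toy example only; the real fields and values come from getProfile() below
toy = {'username': ['user_a', 'user_b'],
       'age':      ['34', '27'],
       'location': ['Portland, Oregon', 'Austin, Texas']}
toy_df = pd.DataFrame(toy).set_index('username')
toy_df.to_csv('toy_profile.csv', encoding='utf-8')   # same export call used for the real data
print toy_df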
# In[ ]:

def getProfile(num, username):
    result = {}
    for num in range(num):
        url = 'http://www.okcupid.com/profile/' + username[num]
        test = requests.get(url, cookies = cookies)
        if test.status_code == 200:
            page = test.text   # reuse the response instead of requesting the page a second time
            soup = BeautifulSoup(page)

            # user basic information
            result.setdefault('username', [])
            result.setdefault('gender', [])
            result.setdefault('age', [])
            result.setdefault('location', [])
            result.setdefault('frequency', [])
            result['username'].append(username[num])
            result['gender'].append(soup.find_all('span', {'class': 'ajax_gender'})[0].get_text())
            result['age'].append(soup.find_all('span', {'id': 'ajax_age'})[0].get_text())
            result['location'].append(soup.find_all('span', {'id': 'ajax_location'})[0].get_text())
            result['frequency'].append(soup.find_all('div', {'class': 'tooltip_text hidden'})[0].get_text())

            basic = ['orientation', 'ethnicities', 'height', 'bodytype', 'diet', 'smoking',
                     'drinking', 'drugs', 'religion', 'sign', 'education', 'job', 'income', 'status',
                     'monogamous', 'children', 'pets', 'languages']
            for i in basic:
                result.setdefault(i, [])
                x = soup.find_all('dd', {'id': 'ajax_' + i})
                if x == []:
                    result[i].append('')
                else:
                    result[i].append(x[0].get_text())

            # user matching information
            find = ['gentation', 'ages', 'near', 'single', 'lookingfor']
            for i in find:
                result.setdefault(i, [])
                x = soup.find_all('li', {'id': 'ajax_' + i})
                if x == []:
                    result[i].append('')
                else:
                    result[i].append(x[0].get_text())

            # user self-description information
            text = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
            for i in text:
                result.setdefault(i, [])
                x = soup.find_all('div', {'id': 'essay_text_' + i})
                if x == []:
                    result[i].append('')
                else:
                    result[i].append(x[0].get_text())

        print num, 'of', len(username), test.status_code == 200
    return result


# In[ ]:

l = []
with open('usernames.txt', 'r') as f:
    for line in f:
        l.append(line.rstrip('\n'))
s = set(l)
print len(l), 'usernames have been added to the usernames list.'
print len(s), 'of them are unique.'

# rewrite the file if there were redundancies
if len(l) != len(s):
    writeUsernames(s)
    print 'usernames.txt file has been rewritten.'
l = list(s)


# In[ ]:

# Set the number of usernames to scrape
result = getProfile(len(l), l)
profile = pd.DataFrame(result)
profile = profile.rename(columns = {'0': '0summary', '1': '1doing', '2': '2goodat', '3': '3notice',
                                    '4': '4books', '5': '5without', '6': '6spendtime', '7': '7friday',
                                    '8': '8private', '9': '9message'})
profile = profile.set_index(['username'])
print profile.columns


# In[ ]:

# Export the profiles to csv
profile.to_csv('profile.csv', encoding='utf-8')
get_ipython().system('head -5 profile.csv')


# In[ ]:

profile


# # Part III. Data analysis
# 
# ### 1. Load the saved straight male, straight female, and bisexual profile csv files into pandas DataFrames for data manipulation.
# 
# ### 2. Demographics Analysis
# * How old are they? Age distribution of each group.
# * Where are they located? Top locations of each group.
# 
# ### 3. Psychological Analysis
# * Who is pickier? Age ranges set by different groups of users compared to their own ages.
# * Who might be lying? Average height of different groups of users compared to CDC data.
# 
# ### 4. Notes
# * The user age distributions observed here are much older than in other online reports. This is possibly affected by the login profile setting: I set my robot profile as a 46-year-old man located in China. From this we can learn that the system still uses my profile settings as a reference, even though I indicated that I'm open to people of all ages.
# * The factors contributing to the height differences could also be: 1) biased data collection, or 2) people who use OkCupid really are taller than average!

# In[3]:

p = pd.read_table('profiles(male).csv', sep=',')
p.groupby([p.gender]).size()


# In[4]:

p2 = pd.read_table('profiles(female).csv', sep=',')
p2.groupby([p2.gender]).size()


# In[5]:

p3 = pd.read_table('profile.csv', sep=',')
p3.groupby([p3.gender]).size()


# In[6]:

get_ipython().run_line_magic('pylab', 'inline')
plt.rcParams['figure.figsize'] = 12, 6
sns.distplot(p.age, color = 'dodgerblue')
plt.xlabel('Age', fontsize = 20)
plt.ylabel('Density', fontsize = 20)
plt.xticks(np.arange(20, 85, 5), fontsize=20)
plt.yticks(fontsize=20)
plt.xlim(18, 80)
plt.title('Straight Male Users Age Density', fontsize = 25)
plt.text(55, 0.035, '2056 users', fontsize = 20)
plt.text(55, 0.03, 'Average age: 44', fontsize = 20)
print '\n\n', 'The average age of', len(p), 'straight male users is', round(mean(p.age))


# In[7]:

plt.rcParams['figure.figsize'] = 12, 6
sns.distplot(p2.age, color = '#FF4D4D')
plt.xlabel('Age', fontsize = 20)
plt.ylabel('Density', fontsize = 20)
plt.xticks(np.arange(20, 85, 5), fontsize=20)
plt.yticks(fontsize=20)
plt.xlim(18, 60)
plt.title('Straight Female Users Age Density', fontsize = 25)
plt.text(45, 0.10, '2412 users', fontsize = 20)
plt.text(45, 0.08, 'Average age: 35', fontsize = 20)
print '\n\n', 'The average age of', len(p2), 'straight female users is', round(mean(p2.age))


# In[8]:

plt.rcParams['figure.figsize'] = 12, 6
sns.distplot(p3.age, color = 'mediumpurple')
plt.xlabel('Age', fontsize = 20)
plt.ylabel('Density', fontsize = 20)
plt.xticks(np.arange(20, 85, 5), fontsize=20)
plt.yticks(fontsize=20)
plt.xlim(18, 50)
plt.title('Bisexual Mixed Gender Users Age Density', fontsize = 25)
plt.text(37, 0.09, '782 users', fontsize = 20)
plt.text(37, 0.08, 'Average age: 26', fontsize = 20)
print '\n\n', 'The average age of', len(p3), 'bisexual users is', round(mean(p3.age))


# In[11]:

# tally the last component of each location string and keep the 10 most common
def locationranks(df):
    ranks = {}
    for i in df.location:
        x = re.split(', ', i)[-1]
        ranks[x] = ranks.get(x, 0) + 1
    ranks = pd.Series(ranks.values(), index = ranks.keys())
    ranks = ranks.order(ascending=False)[:10]
    return ranks

plt.rcParams['figure.figsize'] = 12, 12
locationranks(p).plot(kind='bar', fontsize = 14, legend=False, color = 'dodgerblue')
plt.title('Top 10 Cities of Straight Male Users', fontsize=25)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
plt.style.use('ggplot')


# In[10]:

plt.rcParams['figure.figsize'] = 12, 12
locationranks(p2).plot(kind = 'bar', fontsize = 14, legend=False)
plt.title('Top 10 Cities of Straight Female Users', fontsize=25)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)


# In[12]:

plt.rcParams['figure.figsize'] = 12, 12
locationranks(p3).plot(kind = 'bar', fontsize = 14, legend=False, color = 'mediumpurple')
plt.title('Top 10 Cities of Mixed Gender Bisexual Users', fontsize=25)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
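# The next three cells read the preferred age range out of the `ages` column. Note
# that the pattern matches an en dash (–), not a plain hyphen, because that is how
# the scraped range is written. A quick check of the two regexes on a made-up value
# (the exact surrounding text in the real column may differ):

# In[ ]:

sample_range = 'Ages 25–40'   # hypothetical value in the style the regexes below expect
print int(re.findall('(\d\d)–', sample_range)[0])   # 25, the youngest preferred age
print int(re.findall('–(\d\d)', sample_range)[0])   # 40, the oldest preferred age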
# In[9]:

young = []
old = []
for i in p.index:
    y = int(re.findall('(\d\d)–', p.ages[i])[0])
    o = int(re.findall('–(\d\d)', p.ages[i])[0])
    young.append(y)
    old.append(o)
young = pd.Series(young)
old = pd.Series(old)
agerange = pd.DataFrame(p.age, columns=['age'])
agerange['young'] = young
agerange['old'] = old
print len(agerange)

plt.rcParams['figure.figsize'] = 12, 6
plot = agerange.groupby(agerange.age).mean()
plt.plot(plot.index, plot.index, color='blue', label = 'His age')
plt.plot(plot.index, plot.young, color='dodgerblue', label = 'Her youngest age')
plt.plot(plot.index, plot.old, color='dodgerblue', label = 'Her oldest age')
plot.old[18] = 18
plot.young[99] = 99
plt.fill(plot.index, plot.old, color='dodgerblue', alpha = 0.3)
plt.fill(plot.index, plot.young, color='dodgerblue', alpha = 0.3)
plt.xlabel('His Age', fontsize = 20)
plt.ylabel('Her Age Range', fontsize = 20)
plt.xticks(np.arange(20, 85, 5), fontsize=20)
plt.yticks(fontsize=20)
plt.title('Age Preferences of Straight Men', fontsize = 25)
plt.xlim(18, 60)
plt.legend(loc = 4)


# In[10]:

young = []
old = []
for i in p2.index:
    y = int(re.findall('(\d\d)–', p2.ages[i])[0])
    o = int(re.findall('–(\d\d)', p2.ages[i])[0])
    young.append(y)
    old.append(o)
young = pd.Series(young)
old = pd.Series(old)
agerange = pd.DataFrame(p2.age, columns=['age'])
agerange['young'] = young
agerange['old'] = old
print len(agerange)

plot = agerange.groupby(agerange.age).mean()
plot = plot.loc[plot.index <= 60,]
plt.rcParams['figure.figsize'] = 12, 6
plt.plot(plot.index, plot.index, color='red', label = 'Her age')
plt.plot(plot.index, plot.young, color='#F08080', label = 'His youngest age')
plt.plot(plot.index, plot.old, color='#F08080', label = 'His oldest age')
plot.loc[18] = 18
plot.young[60] = 60
plt.fill(plot.index, plot.old, color='#F08080', alpha = 0.3)
plt.fill(plot.index, plot.young, color='#F08080', alpha = 0.3)
plt.xlabel('Her Age', fontsize = 20)
plt.ylabel('His Age Range', fontsize = 20)
plt.xticks(np.arange(20, 85, 5), fontsize=20)
plt.yticks(fontsize=20)
plt.title('Age Preferences of Straight Women', fontsize = 25)
plt.xlim(18, 60)
plt.legend(loc = 4)


# In[11]:

young = []
old = []
for i in p3.index:
    y = int(re.findall('(\d\d)–', p3.ages[i])[0])
    o = int(re.findall('–(\d\d)', p3.ages[i])[0])
    young.append(y)
    old.append(o)
young = pd.Series(young)
old = pd.Series(old)
agerange = pd.DataFrame(p3.age, columns=['age'])
agerange['young'] = young
agerange['old'] = old
print len(agerange)

plot = agerange.groupby(agerange.age).mean()
plot = plot.loc[plot.index <= 60,]
plt.rcParams['figure.figsize'] = 12, 6
plt.plot(plot.index, plot.index, color='purple', label = 'His/Her age')
plt.plot(plot.index, plot.young, color='mediumpurple', label = 'His/Her youngest age')
plt.plot(plot.index, plot.old, color='mediumpurple', label = 'His/Her oldest age')
plot.loc[18] = 18
plot.old[57] = 57
plot.young[57] = 57
plt.fill(plot.index, plot.old, color='mediumpurple', alpha = 0.3)
plt.fill(plot.index, plot.young, color='mediumpurple', alpha = 0.3)
plt.xlabel('His/Her Age', fontsize = 20)
plt.ylabel('His/Her Age Range', fontsize = 20)
plt.xticks(np.arange(20, 85, 5), fontsize=20)
plt.yticks(fontsize=20)
plt.title('Age Preferences of Bisexual Mixed Gender Users', fontsize = 25)
plt.xlim(18, 60)
plt.legend(loc = 4)
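# The next two cells compare self-reported heights against CDC averages. Judging from
# the regex used below, the scraped height field ends with a metric value in
# parentheses, which gets converted to centimetres. A quick check on a made-up value
# (the exact formatting of the real field may vary):

# In[ ]:

sample_height = "5' 11\" (1.80m)"   # hypothetical string in the style the regex expects
print int(float(re.findall('(\d.\d\d)m', sample_height)[0]) * 100)   # 180 cm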
# In[12]:

get_ipython().run_line_magic('pylab', 'inline')
man = p[(p.gender == 'Man') & (p.age >= 20)]
lst = []
count = 0
for i in man.height:
    if type(i) == str:
        lst.append(int(float(re.findall('(\d.\d\d)m', i)[0]) * 100))
        count += 1
print count
lst = pd.Series(lst)

plt.rcParams['figure.figsize'] = 12, 6
sns.kdeplot(lst, shade=True, label='Self-reported Height on Okcupid (20 years and older)', color = 'dodgerblue')
seq = np.linspace(130, 220, 100)
plt.plot(seq, stats.norm.pdf(seq, loc = 175.9, scale = np.sqrt(5647)*0.2/2),
         color='red', label = 'Average Height from CDC (20 years and older)')
plt.legend(loc = 1)
plt.xticks(np.arange(140, 220, 5), fontsize=15)
plt.yticks(fontsize=20)
plt.xlabel('Height(cm)', fontsize = 20)
plt.ylabel('Density', fontsize = 20)
plt.title('Average Height of Men Compared to Reality', fontsize = 25)
plt.legend(loc = 1)
plt.xlim(145, 210)


# In[13]:

woman = p2[(p2.gender == 'Woman') & (p2.age >= 20)]
lst = []
count = 0
for i in woman.height:
    if type(i) == str:
        lst.append(int(float(re.findall('(\d.\d\d)m', i)[0]) * 100))
        count += 1
print count
lst = pd.Series(lst)

plt.rcParams['figure.figsize'] = 12, 6
sns.kdeplot(lst, shade=True, label='Self-reported Height on Okcupid (20 years and older)', color = '#F08080')
seq = np.linspace(130, 220, 100)
plt.plot(seq, stats.norm.pdf(seq, loc = 162.1, scale = np.sqrt(5971)*0.14/1.5),
         color='red', label = 'Average Height from CDC (20 years and older)')
plt.legend(loc = 1)
plt.xticks(np.arange(140, 220, 5), fontsize=15)
plt.yticks(fontsize=20)
plt.xlabel('Height(cm)', fontsize = 20)
plt.ylabel('Density', fontsize = 20)
plt.title('Average Height of Women Compared to Reality', fontsize = 25)
plt.legend(loc = 1)
plt.xlim(140, 210)


# In[ ]:

# Add latitude and longitude to each profile based on the city location
get_ipython().run_line_magic('time', '')
latitude = []
longitude = []
for i in p2.index:
    print i, 'of', len(p2.index), 'finished.'
    geolocator = Nominatim()
    try:
        location = geolocator.geocode(p2.location[i])
        latitude.append(location.latitude)
        longitude.append(location.longitude)
    except:
        latitude.append(0)
        longitude.append(0)
p2['latitude'] = latitude
p2['longitude'] = longitude
p2.head()


# In[27]:

p2.to_csv('profile(female)geo.csv', encoding='utf-8')


# In[14]:

# Below are word-count functions that I keep here for later text analysis.
def wordCount(raw, punctuation='\n', ignoreCase = True):
    if ignoreCase:   # only lowercase when case should be ignored (the flag was previously unused)
        raw = raw.lower()
    for i in string.punctuation:
        raw = raw.replace(i, '')
    if punctuation != None:
        for i in punctuation:
            raw = raw.replace(i, ' ')
    raw_list = raw.split()
    result = {}
    for word in raw_list:
        result[word] = result.get(word, 0) + 1
    return result

def wordCountSort(raw, descend = True, punctuation='\n', ignoreCase = True):
    result = wordCount(raw, punctuation=punctuation, ignoreCase = ignoreCase)
    result = sorted(result.items(), key = lambda x: x[1], reverse = descend)
    return result

def mostCommonWord(raw, num=5, punctuation='\n', descend = True, ignoreCase = True):
    result = wordCountSort(raw, punctuation=punctuation, ignoreCase = ignoreCase, descend = descend)
    if num > len(result):
        Warning('There are only %s words' % len(result))
        return result
    else:
        return result[:num]
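# A quick usage example of the word-count helpers above on a made-up snippet; the same
# call could later be pointed at the scraped essay columns (e.g. the '0summary' text).

# In[ ]:

sample_text = 'I love hiking and I love cooking, and hiking with my dog'
print mostCommonWord(sample_text, num=3)   # three most frequent (word, count) pairs; tie order may vary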