## all imports from IPython.display import HTML import numpy as np import urllib2 import bs4 #this is beautiful soup from pandas import Series import pandas as pd from pandas import DataFrame import matplotlib import matplotlib.pyplot as plt %matplotlib inline import seaborn as sns sns.set_context("talk") sns.set_style("white") User-agent: Google Disallow: User-agent: * Disallow: / s = """ This is a title

Test

Hello world!

""" h = HTML(s) h url = 'http://www.crummy.com/software/BeautifulSoup' source = urllib2.urlopen(url).read() #print source ## is 'Alice' in source? print 'Alice' in source ## count occurences of 'Soup' print source.count('Soup') ## find index of 'alien video games' position = source.find('alien video games') print position ## quick test to see the substring in the source variable ## you can access strings like lists print source[position:position + 20] ## or the tidier version: print source[position:position + len('alien video games')] ## get bs4 object soup = bs4.BeautifulSoup(source) ## compare the two print statements # print soup # print soup.prettify() ## show how to find all a tags soup.findAll('a') ## ***Why does this not work? *** soup.findAll('Soup') ## get attribute value from an element: ## find tag: this only returns the first occurrence, not all tags in the string first_tag = soup.find('a') ## get attribute `href` first_tag.get('href') ## get all links in the page link_list = [l.get('href') for l in soup.findAll('a')] ## filter all external links # create an empty list to collect the valid links external_links = [] # write a loop to filter the links # if it starts with 'http' we are happy for l in link_list: if l[:4] == 'http': external_links.append(l) # this throws an error! It says something about 'NoneType' # lets investigate. Have a close look at the link_list: link_list # Seems that there are None elements! # Let's verify print sum([l is None for l in link_list]) # So there are two elements in the list that are None! # Let's filter those objects out in the for loop external_links = [] # write a loop to filter the links # if it is not None and starts with 'http' we are happy for l in link_list: if l is not None and l[:4] == 'http': external_links.append(l) external_links # another option for the if statement # didn't know about the startswith function until it was pointed out in class. Thanks! # and we can put this in a list comprehension as well, it almost reads like sentence. [l for l in link_list if l is not None and l.startswith('http')] # redifining `s` without any line breaks s = """This is a title

Test

Hello world!

""" ## get bs4 object tree = bs4.BeautifulSoup(s) ## get html root node root_node = tree.html ## get head from root using contents head = root_node.contents[0] ## get body from root body = root_node.contents[1] ## could directly access body tree.body ## get h3 tag from body body.contents[0] ## use ul as entry point entry_point = soup.find('ul') ## get hall of fame list from entry point ## skip the first entry hall_of_fame_list = entry_point.contents[1:] ## reformat into a list containing strings tmp = [] for li in hall_of_fame_list: tmp.append(li.contents) test = ["".join(str(a) for a in sublist) for sublist in tmp] print '\n'.join(test) stuff_i_like = ['burger', 'sushi', 'sweet potato fries', 'BBQ'] found_happy_hours = [] my_happy_hours = [] # First, I'm going to identify the areas of the page I want to look at url = 'http://www.downtownla.com/3_10_happyHours.asp?action=ALL' source = urllib2.urlopen(url).read() tables = bs4.BeautifulSoup(source) # Then, I'm going to sort out the *exact* parts of the page # that match what I'm looking for... for t in tables.findAll('p', {'class': 'calendar_EventTitle'}): text = t.text for s in t.findNextSiblings(): text += '\n' + s.text found_happy_hours.append(text) print "The scraper found %d happy hours!" % len(found_happy_hours) # Now I'm going to loop through the food I like # and see if any of the happy hour descriptions match for food in stuff_i_like: for hh in found_happy_hours: # checking for text AND making sure I don't have duplicates if food in hh and hh not in my_happy_hours: print "YAY! I found some %s!" % food my_happy_hours.append(hh) print "I think you might like %d of them, yipeeeee!" % len(my_happy_hours) # Now, let's make a mail message we can read: message = 'Hey Katharine,\n\n\n' message += 'OMG, I found some stuff for you in Downtown, take a look.\n\n' message += '==============================\n'.join(my_happy_hours) message = message.encode('utf-8') # To read more about encoding: # http://diveintopython.org/xml_processing/unicode.html message = message.replace('\t', '').replace('\r', '') message += '\n\nXOXO,\n Your Py Script' #print message import json import requests api_key = 'your key here' url = 'http://api.rottentomatoes.com/api/public/v1.0/lists/dvds/top_rentals.json?apikey=' + api_key data = urllib2.urlopen(url).read() #print data a = {'a': 1, 'b':2} print a #show keys print a.keys() #show values print a.values() #show for loop over all entries # option 1 using zip # this works also for iterating over any # other two lists for k,v in zip(a.keys(), a.values()): print k,v # option 2 using the dictionary `iteritems()` function for k,v in a.iteritems(): print k,v a = {'a': 1, 'b':2} s = json.dumps(a) a2 = json.loads(s) ## a is a dictionary #a ## vs s is a string containing a in JSON encoding #s ## reading back the keys are now in unicode #a2 ## create dictionary from JSON dataDict = json.loads(data) ## expore dictionary print dataDict.keys() ## there is a key named `movies` containing a list of movies as a value movies = dataDict['movies'] ## each element of the list `movies` is a dictionary print movies[0].keys() ## one of the keys is called `ratings` ## the value is yet another dictionary print movies[0]['ratings'].keys() ## so we made it all the way to find the critics score print movies[0]['ratings']['critics_score'] # critics scores list critics_scores = [m['ratings']['critics_score'] for m in movies] # audience scores list audience_scores = [m['ratings']['audience_score'] for m in movies] ## create pandas data frame with critics and audience score scores = pd.DataFrame(data=[critics_scores, audience_scores]).transpose() scores.columns = ['critics', 'audience'] ## also create a list with all movie titles movie_titles = [m['title'] for m in movies] ## set index of dataFrame BEWARE of inplace! scores.set_index([movie_titles]) ## the line above does not changes scores! ## You need to either reassign scores = scores.set_index([movie_titles]) ## or set the inplace argument to True scores.set_index([movie_titles], inplace=True) scores.head(3) ## create a bar plot with the data scores.plot(kind = 'bar') ## set the title to Score Comparison plt.title('Score Comparison') ## set the x label plt.xlabel('Movies') ## set the y label plt.ylabel('Scores') ## show the plot plt.show() import twitter ## define the necessary keys cKey = 'your consumer key here' cSecret = 'your consumer secret here' aKey = 'your access token key here' aSecret = 'your access token secret here' ## create the api object with the twitter-python library api = twitter.Api(consumer_key=cKey, consumer_secret=cSecret, access_token_key=aKey, access_token_secret=aSecret) ## get the user timeline with screen_name = 'rafalab' twitter_statuses = api.GetUserTimeline(screen_name = 'rafalab') ## create a data frame ## first get a list of panda Series or dict pdSeriesList = [pd.Series(t.AsDict()) for t in twitter_statuses] ## then create the data frame data = pd.DataFrame(pdSeriesList) data.head(2) ## filter tweets with enough retweet_count maybe_interesting = data[data.retweet_count>10] ## get the text of these tweets tweet_text = maybe_interesting.text ## print them out text = tweet_text.values for t in text: print '######' print t ## create a view for favorite_count on maybe_interesting view = maybe_interesting['favorite_count'] print '-----------------' print "This is view:" print view ## change a value view[8] = 9999 ## look at original frame print '-----------------' print "This is view after changing view[8]" print view print '-----------------' print "This is maybe_interesting after changing view[8]" print "It changed too!" print maybe_interesting['favorite_count'] ## to avoid this you can use copy independent_data = maybe_interesting['favorite_count'].copy() independent_data[10] = 999 print '-----------------' print "This is independent_data after changed at 10:" print independent_data print '-----------------' print "This is maybe_interesting after changing independent_data:" print "It did not change because we only changed a copy of it" print maybe_interesting['favorite_count'] import scipy ## seeding the random generate so we always get the same random numbers np.random.seed(seed=99) # make some data up mean = [0,0] cov = [[1.0,0.7],[0.7,1.0]] x,y = np.random.multivariate_normal(mean,cov,500).T # plot the data fig = plt.figure() plt.scatter(x,y) plt.axis('equal') plt.show() ## create a data matrix matrix = np.column_stack((x,y)) ## compute SVD U,s,Vh = scipy.linalg.svd(matrix) ## blow up s S = scipy.linalg.diagsvd(s, 500, 2) ## reconstruct the data (sanity test) reconstruction = np.dot(U, np.dot(S, Vh)) ## check the results print matrix[1,:] print reconstruction[1,:] ## the allclose() function allows for the data points to deviate by a small ## epsilon and still be considered correctly reconstructed. print np.allclose(matrix, reconstruction) # show the column vectors of V V = Vh.T plt.scatter(x, y) plt.plot([0, V[0,0]], [0,V[1,0]], c='r', linewidth=10.0) plt.plot([0, V[0,1]], [0,V[1,1]], c='y', linewidth=10.0) plt.axis('equal') plt.show() # two ways to project the data projection = np.dot(U, S[:,:1]) projection2 = np.dot(matrix, V[:,:1]) np.allclose(projection, projection2) # compare the plots plt.clf() zeros = np.zeros_like(projection) plt.scatter(projection, zeros, c='r', zorder=2) plt.scatter(x,y,c='b', zorder=2) for px, py, proj in zip(x,y,projection): plt.plot([px,proj],[py,0],c='0.5', linewidth=1.0, zorder=1) plt.axis('equal') plt.show() ## try to reconstruct back to 2D ## just a reminder projection = np.dot(U, S[:,:1]) ## now the reconstruction reconstruction = np.dot(projection, Vh[:1,:]) reconstruction.shape # compare the plots plt.clf() zeros = np.zeros_like(projection) plt.scatter(reconstruction[:,0], reconstruction[:,1], c='r', zorder=2) plt.scatter(x,y,c='b', zorder=2) for px, py, rx,ry in zip(x,y,reconstruction[:,0], reconstruction[:,1]): plt.plot([px,rx],[py,ry],c='0.5', linewidth=1.0, zorder=1) plt.axis('equal') plt.show()